BobAi a84ff94fba Improve UX with streaming tokens, fix model references, and add icon integration
This comprehensive update enhances user experience with several key improvements:

## Enhanced Streaming & Thinking Display
- Implement real-time streaming with gray thinking tokens that collapse after completion (see the rendering sketch below)
- Fix thinking token redisplay bug with proper content filtering
- Add clear "AI Response:" headers to separate thinking from responses
- Enable streaming by default for better user engagement
- Keep thinking visible for exploration, collapse only for suggested questions
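
The terminal rendering behind this is plain ANSI styling plus a cursor rewind. Below is a minimal, self-contained sketch of the idea; the chunk list and the end-of-thinking marker are placeholders standing in for a real streamed LLM response, not the explorer's actual API:

```python
import sys

DIM_ITALIC = "\033[2m\033[3m"   # dim + italic: renders thinking as light gray
RESET = "\033[0m"
END_OF_THINKING = "<answer>"    # placeholder marker for when the model starts its answer

def stream_with_collapsible_thinking(chunks):
    """Print thinking chunks in gray italics, then erase them once the answer begins."""
    printed = "💭 AI Thinking:\n"
    thinking = True
    sys.stdout.write(f"{DIM_ITALIC}{printed}{RESET}")
    for chunk in chunks:
        if chunk == END_OF_THINKING:
            sys.stdout.write("\n")                    # return to column 0
            lines = printed.count("\n") + 1
            sys.stdout.write(f"\033[{lines}A\033[J")  # rewind over the thinking block and clear it
            print("AI Response:")
            thinking = False
            continue
        if thinking:
            printed += chunk
            sys.stdout.write(f"{DIM_ITALIC}{chunk}{RESET}")
        else:
            sys.stdout.write(chunk)                   # answer text in normal style
        sys.stdout.flush()
    print()

# Canned chunks standing in for a streamed response
stream_with_collapsible_thinking(
    ["Let me check the search results...\n", END_OF_THINKING, "Authentication is handled in auth.py."]
)
```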

## Natural Conversation Responses
- Convert clunky JSON exploration responses to natural, conversational format
- Improve exploration prompts for friendly, colleague-style interactions
- Update summary generation with better context handling
- Eliminate double response display issues

## Model Reference Updates
- Remove all llama3.2 references in favor of qwen3 models
- Fix non-existent qwen3:3b references, replace with proper model names
- Update model rankings to prioritize working qwen models across all components
- Ensure consistent model recommendations in docs and examples

## Cross-Platform Icon Integration
- Add desktop icon setup to Linux installer with a .desktop entry (sketched below)
- Add Windows shortcuts for desktop and Start Menu integration
- Improve installer user experience with visual branding
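
For reference, a hedged sketch of what the Linux side of this can look like: a small helper that writes a freedesktop `.desktop` entry into the user's application directory. The application name, command, and icon path below are illustrative placeholders, not the installer's actual values:

```python
from pathlib import Path

def install_desktop_entry(app_name: str = "Code Explorer",
                          exec_cmd: str = "code-explorer",
                          icon_path: str = "~/.local/share/icons/code-explorer.png") -> Path:
    """Write a freedesktop .desktop entry so the app appears in Linux menus."""
    entry = "\n".join([
        "[Desktop Entry]",
        "Type=Application",
        f"Name={app_name}",
        f"Exec={exec_cmd}",
        f"Icon={Path(icon_path).expanduser()}",
        "Terminal=true",
        "Categories=Development;Utility;",
    ]) + "\n"
    target = Path.home() / ".local/share/applications" / "code-explorer.desktop"
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(entry)
    return target
```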

## Configuration & Navigation Fixes
- Fix "0" option in configuration menu to properly go back (see the sketch below)
- Improve configuration menu user-friendliness
- Update troubleshooting guides with correct model suggestions
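
A minimal sketch of the intended menu behavior (option labels are placeholders; the real menu lives in the configuration code): choosing "0" now returns to the previous screen instead of being treated as an invalid option:

```python
def configuration_menu() -> None:
    """Loop over configuration options until the user picks "0" to go back."""
    options = {"1": "Change synthesis model", "2": "Toggle streaming"}
    while True:
        print("\nConfiguration")
        for key in sorted(options):
            print(f"  {key}. {options[key]}")
        print("  0. Back")
        choice = input("Select an option: ").strip()
        if choice == "0":               # "0" exits the menu and returns to the caller
            return
        if choice in options:
            print(f"(would open: {options[choice]})")
        else:
            print("Invalid choice, please try again.")
```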

These changes significantly improve the beginner experience while maintaining
technical accuracy and system reliability.
2025-08-15 12:20:06 +10:00


#!/usr/bin/env python3
"""
Interactive Code Explorer with Thinking Mode
Provides multi-turn conversations with context memory for debugging and learning.
Perfect for exploring codebases with detailed reasoning and follow-up questions.
"""
import json
import logging
import time
from typing import List, Dict, Any, Optional
from pathlib import Path
from dataclasses import dataclass

try:
    from .llm_synthesizer import LLMSynthesizer, SynthesisResult
    from .search import CodeSearcher
    from .config import RAGConfig
except ImportError:
    # For direct testing
    from llm_synthesizer import LLMSynthesizer, SynthesisResult
    from search import CodeSearcher
    from config import RAGConfig

logger = logging.getLogger(__name__)


@dataclass
class ExplorationSession:
    """Track an exploration session with context history."""
    project_path: Path
    conversation_history: List[Dict[str, Any]]
    session_id: str
    started_at: float

    def add_exchange(self, question: str, search_results: List[Any], response: SynthesisResult):
        """Add a question/response exchange to the conversation history."""
        self.conversation_history.append({
            "timestamp": time.time(),
            "question": question,
            "search_results_count": len(search_results),
            "response": {
                "summary": response.summary,
                "key_points": response.key_points,
                "code_examples": response.code_examples,
                "suggested_actions": response.suggested_actions,
                "confidence": response.confidence
            }
        })


class CodeExplorer:
    """Interactive code exploration with thinking and context memory."""

    def __init__(self, project_path: Path, config: Optional[RAGConfig] = None):
        self.project_path = project_path
        self.config = config or RAGConfig()
        # Initialize components with thinking enabled
        self.searcher = CodeSearcher(project_path)
        self.synthesizer = LLMSynthesizer(
            ollama_url=f"http://{self.config.llm.ollama_host}",
            model=self.config.llm.synthesis_model,
            enable_thinking=True,  # Always enable thinking in explore mode
            config=self.config  # Pass config for model rankings
        )
        # Session management
        self.current_session: Optional[ExplorationSession] = None

    def start_exploration_session(self) -> bool:
        """Start a new exploration session."""
        # Simple availability check - don't do complex model restart logic
        if not self.synthesizer.is_available():
            print("❌ LLM service unavailable. Please check Ollama is running.")
            return False
        session_id = f"explore_{int(time.time())}"
        self.current_session = ExplorationSession(
            project_path=self.project_path,
            conversation_history=[],
            session_id=session_id,
            started_at=time.time()
        )
        print("🧠 Exploration Mode Started")
        print(f"Project: {self.project_path.name}")
        return True

    def explore_question(self, question: str, context_limit: int = 10) -> Optional[str]:
        """Explore a question with full thinking and context."""
        if not self.current_session:
            return "❌ No exploration session active. Start one first."
        # Search for relevant information
        search_start = time.time()
        results = self.searcher.search(
            question,
            top_k=context_limit,
            include_context=True,
            semantic_weight=0.7,
            bm25_weight=0.3
        )
        search_time = time.time() - search_start
        # Build enhanced prompt with conversation context
        synthesis_prompt = self._build_contextual_prompt(question, results)
        # Get thinking-enabled analysis
        synthesis_start = time.time()
        synthesis = self._synthesize_with_context(synthesis_prompt, results)
        synthesis_time = time.time() - synthesis_start
        # Add to conversation history
        self.current_session.add_exchange(question, results, synthesis)
        # Streaming already displayed the response; just return a minimal status line for the caller
        session_duration = time.time() - self.current_session.started_at
        exchange_count = len(self.current_session.conversation_history)
        status = (
            f"\n📊 Session: {session_duration/60:.1f}m | Question #{exchange_count} | "
            f"Results: {len(results)} | Time: {search_time+synthesis_time:.1f}s"
        )
        return status

    def _build_contextual_prompt(self, question: str, results: List[Any]) -> str:
        """Build a prompt that includes conversation context."""
        # Get recent conversation context (last 3 exchanges)
        context_summary = ""
        if self.current_session.conversation_history:
            recent_exchanges = self.current_session.conversation_history[-3:]
            context_parts = []
            for i, exchange in enumerate(recent_exchanges, 1):
                prev_q = exchange["question"]
                prev_summary = exchange["response"]["summary"]
                context_parts.append(f"Previous Q{i}: {prev_q}")
                context_parts.append(f"Previous A{i}: {prev_summary}")
            context_summary = "\n".join(context_parts)
        # Build search results context
        results_context = []
        for i, result in enumerate(results[:8], 1):
            file_path = result.file_path if hasattr(result, 'file_path') else 'unknown'
            content = result.content if hasattr(result, 'content') else str(result)
            score = result.score if hasattr(result, 'score') else 0.0
            results_context.append(f"""
Result {i} (Score: {score:.3f}):
File: {file_path}
Content: {content[:800]}{'...' if len(content) > 800 else ''}
""")
        results_text = "\n".join(results_context)
        # Create comprehensive exploration prompt with thinking
        prompt = f"""<think>
The user asked: "{question}"
Let me analyze what they're asking and look at the information I have available.
From the search results, I can see relevant information about:
{results_text[:500]}...
I should think about:
1. What the user is trying to understand or accomplish
2. What information from the search results is most relevant
3. How to explain this in a clear, educational way
4. What practical next steps would be helpful
Based on our conversation so far: {context_summary}
Let me create a helpful response that breaks this down clearly and gives them actionable guidance.
</think>
You're a helpful assistant exploring a project with someone. You're good at breaking down complex topics into understandable pieces and explaining things clearly.
PROJECT: {self.project_path.name}
PREVIOUS CONVERSATION:
{context_summary}
CURRENT QUESTION: "{question}"
RELEVANT INFORMATION FOUND:
{results_text}
Please provide a helpful, natural explanation that answers their question. Write as if you're having a friendly conversation with a colleague who's exploring this project.
Structure your response to include:
1. A clear explanation of what you found and how it answers their question
2. The most important insights from the information you discovered
3. Relevant examples or code patterns when helpful
4. Practical next steps they could take
Guidelines:
- Write in a conversational, friendly tone
- Be educational but not condescending
- Reference specific files and information when helpful
- Give practical, actionable suggestions
- Connect everything back to their original question
- Use natural language, not structured formats
- Break complex topics into understandable pieces
"""
        return prompt

    def _synthesize_with_context(self, prompt: str, results: List[Any]) -> SynthesisResult:
        """Synthesize results with full context and thinking."""
        try:
            # Use streaming with thinking visible (don't collapse)
            response = self.synthesizer._call_ollama(
                prompt,
                temperature=0.2,
                disable_thinking=False,
                use_streaming=True,
                collapse_thinking=False
            )
            # Streaming already shows thinking and response, so no additional indicators are needed
            if not response:
                return SynthesisResult(
                    summary="Analysis unavailable (LLM service error)",
                    key_points=[],
                    code_examples=[],
                    suggested_actions=["Check LLM service status"],
                    confidence=0.0
                )
            # Use natural language response directly
            return SynthesisResult(
                summary=response.strip(),
                key_points=[],  # Not used with natural language responses
                code_examples=[],  # Not used with natural language responses
                suggested_actions=[],  # Not used with natural language responses
                confidence=0.85  # High confidence for natural responses
            )
        except Exception as e:
            logger.error(f"Context synthesis failed: {e}")
            return SynthesisResult(
                summary="Analysis failed due to service error",
                key_points=[],
                code_examples=[],
                suggested_actions=["Check system status and try again"],
                confidence=0.0
            )

    def _format_exploration_response(self, question: str, synthesis: SynthesisResult,
                                     result_count: int, search_time: float, synthesis_time: float) -> str:
        """Format exploration response with context indicators."""
        output = []
        # Header with session context
        session_duration = time.time() - self.current_session.started_at
        exchange_count = len(self.current_session.conversation_history)
        output.append(f"🧠 EXPLORATION ANALYSIS (Question #{exchange_count})")
        output.append(f"Session: {session_duration/60:.1f}m | Results: {result_count} | "
                      f"Time: {search_time+synthesis_time:.1f}s")
        output.append("=" * 60)
        output.append("")
        # Response was already displayed via streaming; just show completion status
        output.append("✅ Analysis complete")
        output.append("")
        output.append("")
        # Confidence and context indicator
        confidence_emoji = "🟢" if synthesis.confidence > 0.7 else "🟡" if synthesis.confidence > 0.4 else "🔴"
        context_indicator = f" | Context: {exchange_count-1} previous questions" if exchange_count > 1 else ""
        output.append(f"{confidence_emoji} Confidence: {synthesis.confidence:.1%}{context_indicator}")
        return "\n".join(output)

    def get_session_summary(self) -> str:
        """Get a summary of the current exploration session."""
        if not self.current_session:
            return "No active exploration session."
        duration = time.time() - self.current_session.started_at
        exchange_count = len(self.current_session.conversation_history)
        summary = [
            "🧠 EXPLORATION SESSION SUMMARY",
            "=" * 40,
            f"Project: {self.project_path.name}",
            f"Session ID: {self.current_session.session_id}",
            f"Duration: {duration/60:.1f} minutes",
            f"Questions explored: {exchange_count}",
            "",
        ]
        if exchange_count > 0:
            summary.append("📋 Topics explored:")
            for i, exchange in enumerate(self.current_session.conversation_history, 1):
                question = exchange["question"][:50] + "..." if len(exchange["question"]) > 50 else exchange["question"]
                confidence = exchange["response"]["confidence"]
                summary.append(f" {i}. {question} (confidence: {confidence:.1%})")
        return "\n".join(summary)

    def end_session(self) -> str:
        """End the current exploration session."""
        if not self.current_session:
            return "No active session to end."
        summary = self.get_session_summary()
        self.current_session = None
        return summary + "\n\n✅ Exploration session ended."

    def _check_model_restart_needed(self) -> bool:
        """Check if a model restart would improve thinking quality."""
        try:
            # Simple heuristic: if we can detect the model was recently used
            # with <no_think>, suggest a restart for better thinking quality.
            # Test with a simple thinking prompt to see response quality.
            test_response = self.synthesizer._call_ollama(
                "Think briefly: what is 2+2?",
                temperature=0.1,
                disable_thinking=False
            )
            if test_response:
                # If response is suspiciously short or shows signs of no-think behavior
                if len(test_response.strip()) < 10 or test_response.strip() == "4":
                    return True
        except Exception:
            pass
        return False

    def _handle_model_restart(self) -> bool:
        """Handle user confirmation and model restart."""
        try:
            print("\n🤔 To ensure best thinking quality, exploration mode works best with a fresh model.")
            print(f" Currently running: {self.synthesizer.model}")
            print("\n💡 Stop current model and restart for optimal exploration? (y/N): ", end="", flush=True)
            response = input().strip().lower()
            if response in ['y', 'yes']:
                print("\n🔄 Stopping current model...")
                # Use ollama stop command for clean model restart
                import subprocess
                try:
                    subprocess.run([
                        "ollama", "stop", self.synthesizer.model
                    ], timeout=10, capture_output=True)
                    print("✅ Model stopped successfully.")
                    print("🚀 Exploration mode will restart the model with thinking enabled...")
                    # Reset synthesizer initialization to force fresh start
                    self.synthesizer._initialized = False
                    return True
                except subprocess.TimeoutExpired:
                    print("⚠️ Model stop timed out, continuing anyway...")
                    return False
                except FileNotFoundError:
                    print("⚠️ 'ollama' command not found, continuing with current model...")
                    return False
                except Exception as e:
                    print(f"⚠️ Error stopping model: {e}")
                    return False
            else:
                print("📝 Continuing with current model...")
                return False
        except (KeyboardInterrupt, EOFError):
            print("\n📝 Continuing with current model...")
            return False

    def _call_ollama_with_thinking(self, prompt: str, temperature: float = 0.3) -> tuple:
        """Call Ollama with streaming for fast time-to-first-token."""
        import requests
        import json
        try:
            # Use the synthesizer's model and connection
            model_to_use = self.synthesizer.model
            if self.synthesizer.model not in self.synthesizer.available_models:
                if self.synthesizer.available_models:
                    model_to_use = self.synthesizer.available_models[0]
                else:
                    return None, None
            # Enable thinking by NOT adding <no_think>
            final_prompt = prompt
            # Get optimal parameters for this model (absolute import as fallback for direct testing)
            try:
                from .llm_optimization import get_optimal_ollama_parameters
            except ImportError:
                from llm_optimization import get_optimal_ollama_parameters
            optimal_params = get_optimal_ollama_parameters(model_to_use)
            payload = {
                "model": model_to_use,
                "prompt": final_prompt,
                "stream": True,  # Enable streaming for fast response
                "options": {
                    "temperature": temperature,
                    "top_p": optimal_params.get("top_p", 0.9),
                    "top_k": optimal_params.get("top_k", 40),
                    "num_ctx": optimal_params.get("num_ctx", 32768),
                    "num_predict": optimal_params.get("num_predict", 2000),
                    "repeat_penalty": optimal_params.get("repeat_penalty", 1.1),
                    "presence_penalty": optimal_params.get("presence_penalty", 1.0)
                }
            }
            response = requests.post(
                f"{self.synthesizer.ollama_url}/api/generate",
                json=payload,
                stream=True,
                timeout=65
            )
            if response.status_code == 200:
                # Collect streaming response
                raw_response = ""
                thinking_displayed = False
                for line in response.iter_lines():
                    if line:
                        try:
                            chunk_data = json.loads(line.decode('utf-8'))
                            chunk_text = chunk_data.get('response', '')
                            if chunk_text:
                                raw_response += chunk_text
                                # Display thinking stream as it comes in
                                if not thinking_displayed and '<think>' in raw_response:
                                    # Start displaying thinking
                                    self._start_thinking_display()
                                    thinking_displayed = True
                                if thinking_displayed:
                                    self._stream_thinking_chunk(chunk_text)
                            if chunk_data.get('done', False):
                                break
                        except json.JSONDecodeError:
                            continue
                # Finish thinking display if it was shown
                if thinking_displayed:
                    self._end_thinking_display()
                # Extract thinking stream and final response
                thinking_stream, final_response = self._extract_thinking(raw_response)
                return final_response, thinking_stream
            else:
                return None, None
        except Exception as e:
            logger.error(f"Thinking-enabled Ollama call failed: {e}")
            return None, None

    def _extract_thinking(self, raw_response: str) -> tuple:
        """Extract thinking content from the response."""
        thinking_stream = ""
        final_response = raw_response
        # Look for thinking patterns
        if "<think>" in raw_response and "</think>" in raw_response:
            # Extract thinking content between tags
            start_tag = raw_response.find("<think>")
            end_tag = raw_response.find("</think>") + len("</think>")
            if start_tag != -1 and end_tag != -1:
                thinking_content = raw_response[start_tag + 7:end_tag - 8]  # Strip the <think>/</think> tags
                thinking_stream = thinking_content.strip()
                # Remove thinking from final response
                final_response = (raw_response[:start_tag] + raw_response[end_tag:]).strip()
        # Alternative patterns for models that use different thinking formats
        elif "Let me think" in raw_response or "I need to analyze" in raw_response:
            # Simple heuristic: first paragraph might be thinking
            lines = raw_response.split('\n')
            potential_thinking = []
            final_lines = []
            thinking_indicators = ["Let me think", "I need to", "First, I'll", "Looking at", "Analyzing"]
            in_thinking = False
            for line in lines:
                if any(indicator in line for indicator in thinking_indicators):
                    in_thinking = True
                    potential_thinking.append(line)
                elif in_thinking and (line.startswith('{') or line.startswith('**') or line.startswith('#')):
                    # Likely end of thinking, start of structured response
                    in_thinking = False
                    final_lines.append(line)
                elif in_thinking:
                    potential_thinking.append(line)
                else:
                    final_lines.append(line)
            if potential_thinking:
                thinking_stream = '\n'.join(potential_thinking).strip()
                final_response = '\n'.join(final_lines).strip()
        return thinking_stream, final_response

    def _start_thinking_display(self):
        """Start the thinking stream display."""
        print("\n\033[2m\033[3m💭 AI Thinking:\033[0m")
        print("\033[2m\033[3m" + "─" * 40 + "\033[0m")
        self._thinking_buffer = ""
        self._in_thinking_tags = False
        self._thinking_done = False

    def _stream_thinking_chunk(self, chunk: str):
        """Stream a chunk of thinking as it arrives."""
        self._thinking_buffer += chunk
        if self._thinking_done:
            # Thinking already finished; filter out later content so it isn't redisplayed
            return
        # Check if we're in thinking tags
        if '<think>' in self._thinking_buffer and not self._in_thinking_tags:
            self._in_thinking_tags = True
            # Display everything after <think>
            start_idx = self._thinking_buffer.find('<think>') + 7
            thinking_content = self._thinking_buffer[start_idx:]
            if thinking_content:
                print(f"\033[2m\033[3m{thinking_content}\033[0m", end='', flush=True)
        elif self._in_thinking_tags and '</think>' not in chunk:
            # We're in thinking mode, display the chunk
            print(f"\033[2m\033[3m{chunk}\033[0m", end='', flush=True)
        if '</think>' in self._thinking_buffer:
            # End of thinking
            self._in_thinking_tags = False
            self._thinking_done = True

    def _end_thinking_display(self):
        """End the thinking stream display."""
        print("\n\033[2m\033[3m" + "─" * 40 + "\033[0m")
        print()

    def _display_thinking_stream(self, thinking_stream: str):
        """Display the thinking stream in light gray italics (fallback for non-streaming)."""
        if not thinking_stream:
            return
        print("\n\033[2m\033[3m💭 AI Thinking:\033[0m")
        print("\033[2m\033[3m" + "─" * 40 + "\033[0m")
        # Split into paragraphs and display with proper formatting
        paragraphs = thinking_stream.split('\n\n')
        for para in paragraphs:
            if para.strip():
                # Wrap long lines nicely
                lines = para.strip().split('\n')
                for line in lines:
                    if line.strip():
                        # Light gray and italic
                        print(f"\033[2m\033[3m{line}\033[0m")
                print()  # Paragraph spacing
        print("\033[2m\033[3m" + "─" * 40 + "\033[0m")
        print()


# Quick test function
def test_explorer():
    """Test the code explorer."""
    explorer = CodeExplorer(Path("."))
    if not explorer.start_exploration_session():
        print("❌ Could not start exploration session")
        return
    # Test question
    response = explorer.explore_question("How does authentication work in this codebase?")
    if response:
        print(response)
    print("\n" + explorer.end_session())


if __name__ == "__main__":
    test_explorer()