# ClaudeForge/skill/analyzer.py

"""
CLAUDE.md File Analyzer

Analyzes existing CLAUDE.md files to identify structure, sections, and quality issues.
Provides detailed analysis reports with quality scores and actionable recommendations.
"""

from typing import Dict, List, Any, Tuple
import re


class CLAUDEMDAnalyzer:
    """
    Analyzes CLAUDE.md files for structure, completeness, and quality.

    Headings are only recognised outside fenced code blocks, so a shell
    comment such as ``# install deps`` inside a fence is not mistaken for a
    markdown heading.
    """

    # Standard sections that should be present in most CLAUDE.md files
    RECOMMENDED_SECTIONS = [
        "Quick Navigation",
        "Core Principles",
        "Tech Stack",
        "Workflow Instructions",
        "Quality Checklist",
        "File Organization",
        "Common Commands",
        "References"
    ]

    # Optional but valuable sections
    OPTIONAL_SECTIONS = [
        "Testing Requirements",
        "Error Handling Patterns",
        "Documentation Standards",
        "Performance Guidelines",
        "Security Checklist",
        "Deployment Process",
        "Troubleshooting"
    ]

    # Line prefixes that open (and close) a fenced code block
    _FENCE_MARKERS = ('```', '~~~')

    def __init__(self, content: str):
        """
        Initialize analyzer with CLAUDE.md file content.

        Args:
            content: Full text content of CLAUDE.md file
        """
        self.content = content
        # splitlines() does not produce a phantom empty last line when the
        # file ends with a newline, so line_count matches the real line count.
        self.lines = content.splitlines()
        self.line_count = len(self.lines)
        self.char_count = len(content)
        self.sections = []
        self.subsections = []

    def analyze_file(self) -> Dict[str, Any]:
        """
        Perform comprehensive analysis of CLAUDE.md file.

        Returns:
            Dictionary containing full analysis results
        """
        # Detect sections up front so the other analyses do not depend on the
        # evaluation order of the dictionary literal below.
        sections_found = self.detect_sections()
        return {
            "file_metrics": self._get_file_metrics(),
            "sections_found": sections_found,
            "missing_sections": self._identify_missing_sections(),
            "structure_analysis": self._analyze_structure(),
            "issues": self._detect_issues(),
            "quality_score": self.calculate_quality_score(),
            "recommendations": self.generate_recommendations()
        }

    def _outline(self) -> List[Tuple[int, str]]:
        """
        Classify every non-blank line as a heading or as content.

        Returns:
            List of (level, line) tuples in file order. ``level`` is the
            number of leading '#' characters for a heading line and 0 for a
            content line. Fence delimiters and everything between them are
            reported as content.
        """
        outline = []
        open_fence = None  # marker of the fence we are currently inside, if any

        for line in self.lines:
            stripped = line.strip()
            if not stripped:
                continue

            marker = stripped[:3]
            if open_fence is not None:
                # Only the marker that opened the fence can close it.
                if marker == open_fence:
                    open_fence = None
                outline.append((0, line))
            elif marker in self._FENCE_MARKERS:
                open_fence = marker
                outline.append((0, line))
            elif line.startswith('#'):
                outline.append((len(line) - len(line.lstrip('#')), line))
            else:
                outline.append((0, line))

        return outline

    def _ensure_sections(self) -> None:
        """Populate self.sections / self.subsections if not done yet."""
        if not self.sections:
            self.detect_sections()

    def _get_file_metrics(self) -> Dict[str, int]:
        """Calculate basic file metrics."""
        return {
            "char_count": self.char_count,
            "line_count": self.line_count,
            "word_count": len(self.content.split()),
            "heading_count": sum(1 for level, _ in self._outline() if level),
            "code_block_count": self.content.count('```') // 2
        }

    def detect_sections(self) -> List[str]:
        """
        Detect all sections (headings) in the file.

        Lines starting with '## ' become sections and lines starting with
        '### ' become subsections; headings inside fenced code blocks are
        ignored. Results are also stored on self.sections / self.subsections.

        Returns:
            List of section titles found
        """
        sections = []
        subsections = []

        for level, line in self._outline():
            if not level:
                continue
            if line.startswith('## '):
                sections.append(line[3:].strip())
            elif line.startswith('### '):
                subsections.append(line[4:].strip())

        self.sections = sections
        self.subsections = subsections
        return sections

    def _identify_missing_sections(self) -> List[str]:
        """
        Identify recommended sections that are missing.

        Returns:
            List of missing section names
        """
        self._ensure_sections()
        found_titles = [section.lower() for section in self.sections]

        # A recommended section counts as present when its name appears
        # (case-insensitively) anywhere inside a detected section title.
        return [
            recommended for recommended in self.RECOMMENDED_SECTIONS
            if not any(recommended.lower() in title for title in found_titles)
        ]

    def _analyze_structure(self) -> Dict[str, Any]:
        """
        Analyze the structural quality of the file.

        Returns:
            Dictionary with structure analysis
        """
        self._ensure_sections()

        has_title = self.content.startswith('# ')
        has_navigation = any('navigation' in s.lower() for s in self.sections)
        has_code_examples = '```' in self.content
        has_links = '[' in self.content and '](' in self.content

        # Check for modular architecture mentions. Both sides are lowercased;
        # comparing the mixed-case keywords against lowercased content could
        # never match 'backend/CLAUDE.md' or 'frontend/CLAUDE.md'.
        content_lower = self.content.lower()
        mentions_modular = any(
            keyword.lower() in content_lower
            for keyword in ['backend/CLAUDE.md', 'frontend/CLAUDE.md', 'subdirectory', 'context-specific']
        )

        return {
            "has_main_title": has_title,
            "has_navigation_section": has_navigation,
            "has_code_examples": has_code_examples,
            "has_links": has_links,
            "mentions_modular_architecture": mentions_modular,
            "section_count": len(self.sections),
            "subsection_count": len(self.subsections),
            "hierarchy_depth": self._calculate_hierarchy_depth()
        }

    def _calculate_hierarchy_depth(self) -> int:
        """Calculate maximum heading depth (headings in code fences ignored)."""
        # Start at 1: assumes at least a '# title' heading.
        return max([1] + [level for level, _ in self._outline()])

    def _has_empty_section(self) -> bool:
        """
        Report whether any section or subsection has no content.

        A heading of level 2 or deeper is empty when the next non-blank line
        is a heading of the same or a shallower level, or when the file ends
        right after it. A section that starts directly with a deeper
        subsection is not considered empty.
        """
        awaiting = 0  # level of the latest heading that has no content yet
        for level, _ in self._outline():
            if not level:
                awaiting = 0
                continue
            if awaiting and level <= awaiting:
                return True
            # The level-1 title is not treated as a section of its own.
            awaiting = level if level >= 2 else 0
        return bool(awaiting)

    def _detect_issues(self) -> List[Dict[str, str]]:
        """
        Detect potential issues with the file.

        Returns:
            List of issue dictionaries with type, severity, and message
        """
        self._ensure_sections()
        issues = []

        # Check file length
        if self.line_count > 250:
            issues.append({
                "type": "length_critical",
                "severity": "high",
                "message": f"File is too long ({self.line_count} lines). Hard cap is 150; split into modular files."
            })
        elif self.line_count > 150:
            issues.append({
                "type": "length_warning",
                "severity": "high",
                "message": f"File exceeds the 150-line cap ({self.line_count} lines). Split now."
            })
        elif self.line_count > 120:
            issues.append({
                "type": "length_warning",
                "severity": "medium",
                "message": f"File is approaching the 150-line cap ({self.line_count} lines)."
            })

        # Check if file is too short
        if self.line_count < 30:
            issues.append({
                "type": "too_short",
                "severity": "medium",
                "message": f"File is very short ({self.line_count} lines). May need more guidance."
            })

        # Check for missing critical sections (case-insensitive partial match,
        # so "Workflow" is satisfied by e.g. "Workflow Instructions")
        critical_sections = ["Core Principles", "Tech Stack", "Workflow"]
        found_titles = [section.lower() for section in self.sections]
        missing_critical = [
            s for s in critical_sections
            if not any(s.lower() in title for title in found_titles)
        ]

        if missing_critical:
            issues.append({
                "type": "missing_critical_sections",
                "severity": "high",
                "message": f"Missing critical sections: {', '.join(missing_critical)}"
            })

        # Check for placeholder text; only the first placeholder found is reported
        placeholders = ['TODO', 'TBD', 'FIXME', '[Insert', '[Add']
        first_placeholder = next((p for p in placeholders if p in self.content), None)
        if first_placeholder is not None:
            issues.append({
                "type": "placeholder_text",
                "severity": "medium",
                "message": f"Contains placeholder text: '{first_placeholder}'"
            })

        # Check for empty sections
        if self._has_empty_section():
            issues.append({
                "type": "empty_sections",
                "severity": "low",
                "message": "Some sections appear to be empty"
            })

        return issues

    def calculate_quality_score(self) -> int:
        """
        Calculate overall quality score (0-100).

        Scoring breakdown:
        - Length appropriateness: 25 points
        - Section completeness: 25 points
        - Formatting quality: 20 points
        - Content specificity: 15 points
        - Modular organization: 15 points

        Returns:
            Quality score between 0 and 100
        """
        score = 0

        # Length appropriateness (25 points). Hard cap is 150 lines; anything
        # above that loses points sharply because it indicates context bloat.
        if 50 <= self.line_count <= 150:
            score += 25
        elif 30 <= self.line_count < 50 or 150 < self.line_count <= 200:
            score += 15
        elif self.line_count > 200:
            score += 5
        else:
            score += 10

        # Section completeness (25 points), proportional to the share of
        # recommended sections that are present.
        missing_count = len(self._identify_missing_sections())
        found_count = len(self.RECOMMENDED_SECTIONS) - missing_count
        score += int((found_count / len(self.RECOMMENDED_SECTIONS)) * 25)

        # Formatting quality (20 points): 5 points per structural feature
        structure = self._analyze_structure()
        formatting_flags = (
            structure["has_main_title"],
            structure["has_code_examples"],
            structure["has_links"],
            structure["has_navigation_section"],
        )
        score += 5 * sum(1 for flag in formatting_flags if flag)

        # Content specificity (15 points)
        # Check for specific tech mentions (not generic)
        tech_keywords = [
            'typescript', 'python', 'react', 'vue', 'angular', 'node',
            'fastapi', 'django', 'postgresql', 'mongodb', 'docker'
        ]
        content_lower = self.content.lower()
        tech_mentions = sum(1 for keyword in tech_keywords if keyword in content_lower)

        if tech_mentions >= 3:
            score += 15
        elif tech_mentions >= 2:
            score += 10
        elif tech_mentions >= 1:
            score += 5

        # Modular organization (15 points)
        modular_keywords = [
            'backend/CLAUDE.md', 'frontend/CLAUDE.md', 'context-specific',
            'subdirectory', 'modular'
        ]
        modular_mentions = sum(1 for keyword in modular_keywords if keyword.lower() in content_lower)

        if modular_mentions >= 2:
            score += 15
        elif modular_mentions >= 1:
            score += 10

        return min(score, 100)

    def generate_recommendations(self) -> List[str]:
        """
        Generate actionable recommendations for improvement.

        Returns:
            List of recommendation strings (at most 8)
        """
        recommendations = []

        missing = self._identify_missing_sections()
        issues = self._detect_issues()

        # Critical issues first
        for issue in issues:
            if issue['severity'] != 'high':
                continue
            if issue['type'] == 'length_critical':
                recommendations.append(
                    "CRITICAL: Split into modular files - create backend/CLAUDE.md, "
                    "frontend/CLAUDE.md, etc."
                )
            elif issue['type'] == 'missing_critical_sections':
                recommendations.append(f"CRITICAL: {issue['message']}")

        # Length recommendations
        if self.line_count > 150:
            recommendations.append(
                "Reduce this CLAUDE.md to <=150 lines (hard cap) - move detail to context-specific files and chain them via @path imports"
            )
        elif self.line_count < 30:
            recommendations.append(
                "Expand with essential sections: Core Principles, Tech Stack, Workflow Instructions"
            )

        # Missing sections
        if missing:
            high_priority = ["Core Principles", "Tech Stack", "Workflow Instructions"]
            missing_high_priority = [s for s in missing if s in high_priority]

            if missing_high_priority:
                recommendations.append(
                    f"Add essential sections: {', '.join(missing_high_priority)}"
                )

            # Only suggest the remaining sections when there are a few of
            # them; an empty list must not yield a bare "Consider adding: ".
            missing_optional = [s for s in missing if s not in high_priority]
            if 0 < len(missing_optional) <= 3:
                recommendations.append(
                    f"Consider adding: {', '.join(missing_optional)}"
                )

        # Structure recommendations
        structure = self._analyze_structure()
        if not structure['has_navigation_section'] and self.line_count > 100:
            recommendations.append(
                "Add Quick Navigation section with links to context-specific guides"
            )

        if not structure['has_code_examples']:
            recommendations.append(
                "Include code examples for complex patterns to improve clarity"
            )

        # Modular architecture
        if self.line_count > 200 and not structure['mentions_modular_architecture']:
            recommendations.append(
                "Consider implementing modular architecture - separate files for major components"
            )

        # Quality improvements
        quality_score = self.calculate_quality_score()
        if quality_score < 60:
            recommendations.append(
                f"Overall quality score is {quality_score}/100 - prioritize critical improvements"
            )

        return recommendations[:8]  # Limit to top 8 recommendations