""" Multi-Language AST Parser Module Supports static analysis across multiple programming languages """ import re import json from typing import Dict, List, Optional, Any, Union from dataclasses import dataclass from enum import Enum class ProgrammingLanguage(Enum): PYTHON = "python" JAVASCRIPT = "javascript" TYPESCRIPT = "typescript" JAVA = "java" CPP = "cpp" CSHARP = "csharp" GO = "go" RUST = "rust" class IssueSeverity(Enum): LOW = "low" MEDIUM = "medium" HIGH = "high" CRITICAL = "critical" @dataclass class CodeIssue: """Structure for code analysis findings""" file_path: str line_number: int column: int severity: IssueSeverity rule_id: str message: str suggestion: str language: ProgrammingLanguage @dataclass class FunctionInfo: """Information about extracted functions""" name: str start_line: int end_line: int parameters: List[str] return_type: Optional[str] complexity: int class MultiLanguageParser: """AST parser for multiple programming languages""" def __init__(self): self.language_patterns = { ProgrammingLanguage.PYTHON: { "extension": ".py", "function_pattern": r"def\s+(\w+)\s*\([^)]*\)\s*:", "class_pattern": r"class\s+(\w+)\s*(?:\([^)]*\))?\s*:", "import_pattern": r"^(?:import|from)\s+", "comment_pattern": r"#.*$" }, ProgrammingLanguage.JAVASCRIPT: { "extension": ".js", "function_pattern": r"(?:function\s+(\w+)\s*\([^)]*\)|const\s+(\w+)\s*=\s*\([^)]*\)\s*=>)", "class_pattern": r"class\s+(\w+)\s*(?:extends\s+\w+)?\s*{", "import_pattern": r"^(?:import|const)\s+", "comment_pattern": r"//.*$|/\*.*?\*/" }, ProgrammingLanguage.JAVA: { "extension": ".java", "function_pattern": r"(?:public|private|protected)?\s*(?:static)?\s*\w+\s+(\w+)\s*\([^)]*\)\s*{", "class_pattern": r"(?:public\s+)?class\s+(\w+)\s*(?:extends\s+\w+)?\s*{", "import_pattern": r"^import\s+", "comment_pattern": r"//.*$|/\*.*?\*/" } } self.security_rules = { "hardcoded_secrets": { "pattern": r"(?:password|secret|key|token)\s*[=:]\s*[\"'][^\"']+[\"']", "severity": IssueSeverity.CRITICAL, "message": "Hardcoded secret detected" }, "sql_injection": { "pattern": r"(?:execute|query)\s*\(\s*[\"'][^\"']*\+[^\"']*[\"']", "severity": IssueSeverity.HIGH, "message": "Potential SQL injection vulnerability" }, "eval_usage": { "pattern": r"eval\s*\(", "severity": IssueSeverity.HIGH, "message": "Use of eval() function detected" }, "weak_crypto": { "pattern": r"(?:md5|sha1)\s*\(", "severity": IssueSeverity.MEDIUM, "message": "Weak cryptographic algorithm detected" } } def detect_language(self, file_path: str) -> Optional[ProgrammingLanguage]: """Detect programming language from file extension""" for lang, patterns in self.language_patterns.items(): if file_path.endswith(patterns["extension"]): return lang return None def parse_code(self, file_path: str, code: str) -> Dict[str, Any]: """Parse code and extract structural information""" language = self.detect_language(file_path) if not language: return {"error": "Unsupported language"} patterns = self.language_patterns[language] # Remove comments code_no_comments = self._remove_comments(code, patterns["comment_pattern"]) # Extract functions functions = self._extract_functions(code_no_comments, patterns["function_pattern"]) # Extract classes classes = self._extract_classes(code_no_comments, patterns["class_pattern"]) # Calculate complexity metrics complexity = self._calculate_complexity(code_no_comments, functions) return { "language": language.value, "functions": functions, "classes": classes, "complexity": complexity, "lines_of_code": len(code.splitlines()) } def analyze_security(self, file_path: str, code: str) -> List[CodeIssue]: """Perform security analysis on code""" language = self.detect_language(file_path) if not language: return [] issues = [] lines = code.splitlines() for rule_id, rule in self.security_rules.items(): for line_num, line in enumerate(lines, 1): if re.search(rule["pattern"], line, re.IGNORECASE): # Find column position match = re.search(rule["pattern"], line, re.IGNORECASE) column = match.start() + 1 if match else 1 issue = CodeIssue( file_path=file_path, line_number=line_num, column=column, severity=rule["severity"], rule_id=rule_id, message=rule["message"], suggestion=self._get_security_suggestion(rule_id), language=language ) issues.append(issue) return issues def analyze_quality(self, file_path: str, code: str) -> List[CodeIssue]: """Perform code quality analysis""" language = self.detect_language(file_path) if not language: return [] issues = [] lines = code.splitlines() # Long lines for line_num, line in enumerate(lines, 1): if len(line) > 120: issues.append(CodeIssue( file_path=file_path, line_number=line_num, column=120, severity=IssueSeverity.LOW, rule_id="long_line", message="Line too long (>120 characters)", suggestion="Break line into multiple lines", language=language )) # Complex functions (simplified) if language == ProgrammingLanguage.PYTHON: functions = self._extract_functions(code, self.language_patterns[language]["function_pattern"]) for func in functions: if func["complexity"] > 10: issues.append(CodeIssue( file_path=file_path, line_number=func["start_line"], column=1, severity=IssueSeverity.MEDIUM, rule_id="complex_function", message=f"Function '{func['name']}' is too complex (cyclomatic complexity: {func['complexity']})", suggestion="Consider breaking down into smaller functions", language=language )) return issues def _remove_comments(self, code: str, comment_pattern: str) -> str: """Remove comments from code""" lines = code.splitlines() cleaned_lines = [] for line in lines: # Remove inline comments cleaned = re.sub(comment_pattern, "", line) cleaned_lines.append(cleaned.rstrip()) return "\n".join(cleaned_lines) def _extract_functions(self, code: str, pattern: str) -> List[Dict[str, Any]]: """Extract function information from code""" functions = [] lines = code.splitlines() for line_num, line in enumerate(lines, 1): match = re.search(pattern, line) if match: func_name = match.group(1) if match.groups() else match.group(0) # Simple complexity calculation (count control structures) complexity = 1 control_patterns = [r"\bif\b", r"\bfor\b", r"\bwhile\b", r"\btry\b"] for pattern in control_patterns: complexity += len(re.findall(pattern, line)) functions.append({ "name": func_name, "start_line": line_num, "complexity": complexity }) return functions def _extract_classes(self, code: str, pattern: str) -> List[Dict[str, Any]]: """Extract class information from code""" classes = [] lines = code.splitlines() for line_num, line in enumerate(lines, 1): match = re.search(pattern, line) if match: class_name = match.group(1) classes.append({ "name": class_name, "start_line": line_num }) return classes def _calculate_complexity(self, code: str, functions: List[Dict[str, Any]]) -> Dict[str, int]: """Calculate complexity metrics""" total_complexity = sum(func["complexity"] for func in functions) return { "cyclomatic_complexity": total_complexity, "function_count": len(functions), "average_complexity": int(total_complexity / len(functions)) if functions else 0 } def _get_security_suggestion(self, rule_id: str) -> str: """Get security improvement suggestions""" suggestions = { "hardcoded_secrets": "Use environment variables or secret management service", "sql_injection": "Use parameterized queries or prepared statements", "eval_usage": "Replace with safer alternatives or validate input strictly", "weak_crypto": "Use stronger algorithms like SHA-256 or bcrypt" } return suggestions.get(rule_id, "Review and fix the security issue") # Test implementation def test_multi_language_parser(): """Test the multi-language parser""" parser = MultiLanguageParser() # Test Python code python_code = """ def authenticate_user(username, password): # Hardcoded secret - bad practice secret_key = "hardcoded_secret_123" if username == "admin": return True return False class UserManager: def __init__(self): pass """ # Parse Python code result = parser.parse_code("test.py", python_code) assert result["language"] == "python" assert len(result["functions"]) == 1 assert len(result["classes"]) == 1 # Security analysis security_issues = parser.analyze_security("test.py", python_code) assert len(security_issues) > 0 assert any(issue.rule_id == "hardcoded_secrets" for issue in security_issues) # Quality analysis quality_issues = parser.analyze_quality("test.py", python_code) assert isinstance(quality_issues, list) print("✅ Multi-language parser tests passed") return True if __name__ == "__main__": test_multi_language_parser()