hollywoodfrancis committed on
Commit 6061012 · verified · 1 Parent(s): 7628384

Upload 6 files

coding_expert/config/config.py ADDED
@@ -0,0 +1,169 @@
+ """
+ Configuration for the Coding Expert model
+ """
+
+ # Core programming domains
+ CODING_DOMAINS = {
+     "programming_languages": {
+         "python": {
+             "level": "expert",
+             "focus": ["data structures", "algorithms", "web development", "machine learning"]
+         },
+         "javascript": {
+             "level": "expert",
+             "focus": ["frontend", "backend", "frameworks", "performance"]
+         },
+         "java": {
+             "level": "expert",
+             "focus": ["enterprise", "concurrency", "frameworks", "design patterns"]
+         },
+         "c++": {
+             "level": "expert",
+             "focus": ["systems", "performance", "templates", "memory management"]
+         },
+         "go": {
+             "level": "expert",
+             "focus": ["concurrency", "networking", "performance", "cloud"]
+         }
+     },
+     "frameworks": {
+         "web": {
+             "django": "expert",
+             "flask": "expert",
+             "fastapi": "expert",
+             "react": "expert",
+             "vue": "expert",
+             "angular": "expert"
+         },
+         "mobile": {
+             "flutter": "expert",
+             "react_native": "expert",
+             "swift": "expert",
+             "kotlin": "expert"
+         },
+         "cloud": {
+             "aws": "expert",
+             "gcp": "expert",
+             "azure": "expert",
+             "kubernetes": "expert"
+         }
+     },
+     "tools": {
+         "ci_cd": ["github_actions", "jenkins", "circleci", "gitlab_ci"],
+         "version_control": ["git", "mercurial"],
+         "package_management": ["pip", "npm", "maven", "gradle", "cargo"],
+         "ide": ["vscode", "pycharm", "intellij", "vim", "emacs"]
+     }
+ }
+
+ # Core coding tasks
+ CODING_TASKS = {
+     "problem_solving": {
+         "level": "expert",
+         "subtasks": [
+             "algorithm_design",
+             "data_structure_selection",
+             "complexity_analysis",
+             "optimization"
+         ]
+     },
+     "code_review": {
+         "level": "expert",
+         "subtasks": [
+             "architecture_review",
+             "security_review",
+             "performance_review",
+             "code_style_review"
+         ]
+     },
+     "debugging": {
+         "level": "expert",
+         "subtasks": [
+             "memory_leaks",
+             "race_conditions",
+             "performance_bottlenecks",
+             "concurrency_issues"
+         ]
+     },
+     "testing": {
+         "level": "expert",
+         "subtasks": [
+             "unit_testing",
+             "integration_testing",
+             "performance_testing",
+             "security_testing"
+         ]
+     },
+     "architecture_design": {
+         "level": "expert",
+         "subtasks": [
+             "microservices",
+             "distributed_systems",
+             "scalability",
+             "fault_tolerance"
+         ]
+     }
+ }
+
+ # Core datasets
+ CODING_DATASETS = {
+     "CodeSearchNet": {
+         "source": "codeium/codeium",
+         "split": "train",
+         "fields": ["code", "docstring", "language", "function_name"],
+         "description": "HuggingFace - multi-language code corpus",
+         "tasks": ["code_search", "code_completion", "documentation"]
+     },
+     "HumanEval": {
+         "source": "openai/human_eval",
+         "split": "test",
+         "fields": ["task_id", "prompt", "canonical_solution", "test", "entry_point"],
+         "description": "OpenAI's functional code evaluation dataset",
+         "tasks": ["code_generation", "function_implementation", "unit_testing"]
+     },
+     "MBPP": {
+         "source": "mbpp/mbpp",
+         "split": "train",
+         "fields": ["task_id", "text", "code", "test_list", "challenge_test_list"],
+         "description": "Mostly Basic Python Problems",
+         "tasks": ["problem_solving", "code_generation", "unit_testing"]
+     },
+     "Spider": {
+         "source": "yale-lily/spider",
+         "split": "train",
+         "fields": ["query", "question", "db_id", "sql"],
+         "description": "Text-to-SQL mapping",
+         "tasks": ["sql_generation", "text_to_sql", "database_queries"]
+     },
+     "DeepFix": {
+         "source": "deepfix/deepfix",
+         "split": "train",
+         "fields": ["code", "fixed_code", "error_type"],
+         "description": "Bug fixing dataset",
+         "tasks": ["bug_fixing", "error_detection", "code_correction"]
+     },
+     "CodeXGLUE": {
+         "source": "microsoft/CodeXGLUE",
+         "split": "train",
+         "fields": ["code", "docstring", "task", "language"],
+         "description": "Multitask code understanding/generation benchmark",
+         "tasks": ["code_translation", "code_summarization", "code_generation"]
+     }
+ }
+
+ # Print configuration summary
+ def print_config_summary():
+     print("\nCoding Expert Configuration Summary:")
+     print(f"Number of domains: {len(CODING_DOMAINS)}")
+     print(f"Number of languages: {len(CODING_DOMAINS['programming_languages'])}")
+     print(f"Number of tasks: {len(CODING_TASKS)}")
+     print(f"Number of datasets: {len(CODING_DATASETS)}")
+     print("\nDataset Details:")
+     for name, config in CODING_DATASETS.items():
+         print(f"\n{name}:")
+         print(f"Description: {config['description']}")
+         print(f"Tasks: {', '.join(config['tasks'])}")
+         print(f"Fields: {', '.join(config['fields'])}")
+
+ if __name__ == "__main__":
+     print_config_summary()
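
A minimal usage sketch for this config module, assuming the repository root is on PYTHONPATH; the loop variable names are illustrative:

    from coding_expert.config.config import CODING_DOMAINS, CODING_DATASETS, print_config_summary

    # Look up the declared focus areas for Python
    print(CODING_DOMAINS["programming_languages"]["python"]["focus"])

    # Map each dataset to the tasks it is configured for
    for name, cfg in CODING_DATASETS.items():
        print(f"{name}: {', '.join(cfg['tasks'])}")

    print_config_summary()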
coding_expert/config/requirements.txt ADDED
@@ -0,0 +1,14 @@
+ transformers>=4.30.0
+ sympy>=1.11.1
+ torch>=2.0.0
+ numpy>=1.24.0
+ scipy>=1.10.0
+ pandas>=2.0.0
+ huggingface_hub>=0.16.0
+ jsonlines>=3.0.0
+ pyyaml>=5.4.1
+ datasets>=2.14.0
+ psutil>=5.9.0
+ astroid>=2.16.0
+ tqdm>=4.65.0
+ pylint>=2.17.0
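
tqdm and pylint are listed because utils/data_processor.py imports tqdm and tasks/validation.py shells out to pylint; install everything with pip install -r coding_expert/config/requirements.txt.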
coding_expert/data/prepare_data.py ADDED
@@ -0,0 +1,221 @@
+ """
+ Data preparation script for the Coding Expert model
+ """
+ import os
+ import json
+ from pathlib import Path
+ import jsonlines
+ from typing import Dict, List, Any
+ import sys
+ import psutil
+ from datasets import load_dataset
+ import ast
+ import numpy as np
+
+ from coding_expert.utils.data_processor import CodeDataProcessor
+
+ class CodeDataPreparer:
+     def __init__(self, output_dir: str = "processed_data"):
+         self.output_dir = Path(output_dir)
+         self.output_dir.mkdir(exist_ok=True)
+         self.datasets = {
+             "CodeSearchNet": {
+                 "source": "codeium/codeium",
+                 "split": "train",
+                 "fields": ["code", "docstring", "language", "function_name"]
+             },
+             "HumanEval": {
+                 "source": "openai/human_eval",
+                 "split": "test",
+                 "fields": ["task_id", "prompt", "canonical_solution", "test", "entry_point"]
+             },
+             "MBPP": {
+                 "source": "mbpp/mbpp",
+                 "split": "train",
+                 "fields": ["task_id", "text", "code", "test_list", "challenge_test_list"]
+             },
+             "Spider": {
+                 "source": "yale-lily/spider",
+                 "split": "train",
+                 "fields": ["query", "question", "db_id", "sql"]
+             },
+             "DeepFix": {
+                 "source": "deepfix/deepfix",
+                 "split": "train",
+                 "fields": ["code", "fixed_code", "error_type"]
+             },
+             "CodeXGLUE": {
+                 "source": "microsoft/CodeXGLUE",
+                 "split": "train",
+                 "fields": ["code", "docstring", "task", "language"]
+             }
+         }
+
+     def process_dataset(self, dataset: List[Dict[str, Any]], dataset_name: str) -> List[Dict[str, Any]]:
+         """Process a specific dataset"""
+         processed = []
+         error_count = 0
+
+         print(f"\nProcessing {dataset_name} dataset...")
+
+         for idx, example in enumerate(dataset):
+             try:
+                 processed_example = self._process_example(dataset_name, example)
+                 processed.append(processed_example)
+             except Exception as e:
+                 print(f"Error processing example {idx} in {dataset_name}: {str(e)}")
+                 error_count += 1
+
+         print(f"Processed {len(processed)} examples from {dataset_name}")
+         print(f"Encountered {error_count} errors during processing")
+         return processed
+
+     def _process_example(self, dataset_name: str, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process a single example based on its dataset type"""
+         if dataset_name == "CodeSearchNet":
+             return self._process_code_search_net(example)
+         elif dataset_name == "HumanEval":
+             return self._process_human_eval(example)
+         elif dataset_name == "MBPP":
+             return self._process_mbpp(example)
+         elif dataset_name == "Spider":
+             return self._process_spider(example)
+         elif dataset_name == "DeepFix":
+             return self._process_deep_fix(example)
+         elif dataset_name == "CodeXGLUE":
+             return self._process_codexglue(example)
+         else:
+             raise ValueError(f"Unknown dataset: {dataset_name}")
+
+     def _process_code_search_net(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process CodeSearchNet example"""
+         return {
+             "code": example["code"].strip(),
+             "docstring": example["docstring"].strip(),
+             "language": example["language"],
+             "function_name": example["function_name"],
+             "code_analysis": self._analyze_code(example["code"])
+         }
+
+     def _process_human_eval(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process HumanEval example"""
+         return {
+             "task_id": example["task_id"],
+             "prompt": example["prompt"].strip(),
+             "solution": example["canonical_solution"].strip(),
+             "test": example["test"].strip(),
+             "entry_point": example["entry_point"],
+             "code_analysis": self._analyze_code(example["canonical_solution"])
+         }
+
+     def _process_mbpp(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process MBPP example"""
+         return {
+             "task_id": example["task_id"],
+             "problem": example["text"].strip(),
+             "solution": example["code"].strip(),
+             "test_list": example["test_list"],
+             "challenge_test_list": example["challenge_test_list"],
+             "code_analysis": self._analyze_code(example["code"])
+         }
+
+     def _process_spider(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process Spider example"""
+         return {
+             "query": example["query"].strip(),
+             "question": example["question"].strip(),
+             "db_id": example["db_id"],
+             "sql": example["sql"].strip(),
+             "code_analysis": self._analyze_code(example["sql"])  # SQL is not Python; this records a parse error
+         }
+
+     def _process_deep_fix(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process DeepFix example"""
+         return {
+             "original_code": example["code"].strip(),
+             "fixed_code": example["fixed_code"].strip(),
+             "error_type": example["error_type"],
+             "code_analysis": self._analyze_code(example["fixed_code"])
+         }
+
+     def _process_codexglue(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process CodeXGLUE example"""
+         return {
+             "code": example["code"].strip(),
+             "docstring": example["docstring"].strip(),
+             "task": example["task"],
+             "language": example["language"],
+             "code_analysis": self._analyze_code(example["code"])
+         }
+
+     def _analyze_code(self, code: str) -> Dict[str, Any]:
+         """Analyze code structure and complexity"""
+         try:
+             tree = ast.parse(code)
+             return {
+                 "num_functions": len([node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]),
+                 "num_classes": len([node for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]),
+                 "complexity": self._calculate_complexity(tree)
+             }
+         except Exception as e:
+             return {"error": str(e)}
+
+     def _calculate_complexity(self, tree: ast.AST) -> int:
+         """Calculate cyclomatic complexity"""
+         complexity = 1  # Start with 1 for the main program
+         for node in ast.walk(tree):
+             if isinstance(node, (ast.If, ast.For, ast.While, ast.Try, ast.ExceptHandler)):
+                 complexity += 1
+         return complexity
+
+     def save_to_jsonl(self, data: List[Dict[str, Any]], filename: str):
+         """Save data to JSONL file"""
+         filepath = self.output_dir / filename
+         with jsonlines.open(filepath, mode='w') as writer:
+             writer.write_all(data)
+         return filepath
+
+     def print_sample(self, data: List[Dict[str, Any]], count: int = 3):
+         """Print sample of processed data"""
+         print("\nSample data:")
+         for i, example in enumerate(data[:count]):
+             print(f"\nSample {i+1}:")
+             print(json.dumps(example, indent=2))
+
+     def print_memory_usage(self):
+         """Print current memory usage"""
+         process = psutil.Process()
+         memory_info = process.memory_info()
+         print(f"Current memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")
+
+ def main():
+     preparer = CodeDataPreparer()
+
+     # Process each dataset
+     for dataset_name, config in preparer.datasets.items():
+         try:
+             print(f"\nLoading {dataset_name} dataset...")
+             dataset = load_dataset(config["source"], split=config["split"])
+             print(f"Loaded {len(dataset)} samples from {dataset_name}")
+
+             processed_data = preparer.process_dataset(dataset, dataset_name)
+             print(f"Processed {len(processed_data)} samples")
+
+             preparer.print_sample(processed_data)
+
+             # Save processed data
+             output_path = preparer.save_to_jsonl(
+                 processed_data,
+                 f"{dataset_name.lower()}_processed.jsonl"
+             )
+             print(f"\nSaved {dataset_name} data to: {output_path}")
+
+         except Exception as e:
+             print(f"Error processing {dataset_name} dataset: {str(e)}")
+             print("Continuing with next dataset...")
+
+     # Print memory usage
+     preparer.print_memory_usage()
+
+ if __name__ == "__main__":
+     main()
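
A minimal sketch of the processing path that avoids any dataset download, assuming the coding_expert package is importable; the single record below is a hypothetical HumanEval-style example:

    from coding_expert.data.prepare_data import CodeDataPreparer

    preparer = CodeDataPreparer(output_dir="processed_data")
    sample = [{
        "task_id": "demo/0",
        "prompt": "def add(a, b):\n",
        "canonical_solution": "def add(a, b):\n    return a + b\n",
        "test": "assert add(1, 2) == 3",
        "entry_point": "add",
    }]
    processed = preparer.process_dataset(sample, "HumanEval")
    print(preparer.save_to_jsonl(processed, "humaneval_demo.jsonl"))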
coding_expert/expert.py ADDED
@@ -0,0 +1,21 @@
+ from .config import config
+ from .utils.data_processor import CodeDataProcessor
+ from .tasks.validation import CodeValidator
+
+ class CodingExpert:
+     def __init__(self):
+         self.config = config
+         self.data_processor = CodeDataProcessor()
+         self.validator = CodeValidator()
+
+     def process_data(self, input_data):
+         """Process coding-related data using the data processor."""
+         return self.data_processor.process_code(input_data)
+
+     def validate(self, code):
+         """Validate code using the validation system."""
+         return self.validator.validate_code(code)
+
+     def get_config(self):
+         """Return the current configuration."""
+         return self.config
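
Because expert.py uses relative imports, CodingExpert has to be reached through the package; a sketch, assuming coding_expert and its subfolders carry the __init__.py files they need:

    from coding_expert.expert import CodingExpert

    expert = CodingExpert()
    result = expert.validate("def square(x):\n    return x * x\n")
    print(result["validation_score"])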
coding_expert/tasks/validation.py ADDED
@@ -0,0 +1,245 @@
+ """
+ Validation module for the Coding Expert model
+ """
+ import os
+ import json
+ from pathlib import Path
+ import hashlib
+ import datetime
+ from typing import Dict, Any, List, Optional
+ import subprocess
+ import ast
+ import numpy as np
+ import psutil
+
+ class CodeValidator:
+     def __init__(self, checkpoint_dir: str = "checkpoints"):
+         self.checkpoint_dir = Path(checkpoint_dir)
+         self.checkpoint_dir.mkdir(exist_ok=True)
+         self.validation_dir = self.checkpoint_dir / "validation"
+         self.validation_dir.mkdir(exist_ok=True)
+
+         # Initialize validation metrics
+         self.metrics = {
+             "code_quality": [],
+             "performance": [],
+             "memory_usage": [],
+             "error_count": []
+         }
+
+     def validate_code(self, code: str, language: str = "python") -> Dict[str, Any]:
+         """Validate code quality and performance"""
+         try:
+             # Parse the code to check syntax
+             tree = ast.parse(code)
+
+             # Calculate code metrics
+             metrics = self._calculate_code_metrics(tree)
+
+             # Run static analysis
+             static_analysis = self._run_static_analysis(code, language)
+
+             # Check for common issues
+             issues = self._check_common_issues(tree)
+
+             return {
+                 "is_valid": not issues,
+                 "metrics": metrics,
+                 "static_analysis": static_analysis,
+                 "issues": issues,
+                 "validation_score": self._calculate_validation_score(metrics, issues)
+             }
+         except Exception as e:
+             return {
+                 "is_valid": False,
+                 "error": str(e),
+                 "validation_score": 0.0
+             }
+
+     def _calculate_code_metrics(self, tree: ast.AST) -> Dict[str, Any]:
+         """Calculate various code metrics"""
+         return {
+             "complexity": self._calculate_complexity(tree),
+             "num_functions": len([node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]),
+             "num_classes": len([node for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]),
+             "num_imports": len([node for node in ast.walk(tree) if isinstance(node, (ast.Import, ast.ImportFrom))]),
+             "num_statements": len([node for node in ast.walk(tree) if isinstance(node, ast.stmt)])
+         }
+
+     def _calculate_complexity(self, tree: ast.AST) -> int:
+         """Calculate cyclomatic complexity"""
+         complexity = 1  # Start with 1 for the main program
+         for node in ast.walk(tree):
+             if isinstance(node, (ast.If, ast.For, ast.While, ast.Try, ast.ExceptHandler)):
+                 complexity += 1
+         return complexity
+
+     def _run_static_analysis(self, code: str, language: str) -> Dict[str, Any]:
+         """Run static analysis tools"""
+         if language == "python":
+             try:
+                 # Run pylint; --from-stdin requires a synthetic module file name
+                 process = subprocess.run(
+                     ["pylint", "--from-stdin", "snippet.py"],
+                     input=code,
+                     capture_output=True,
+                     text=True,
+                     timeout=5
+                 )
+                 score = float(process.stdout.split("Your code has been rated at")[1].split("/")[0])
+                 return {
+                     "pylint_score": score,
+                     "issues": process.stdout.count("error")
+                 }
+             except Exception as e:
+                 return {
+                     "pylint_score": 0.0,
+                     "error": str(e)
+                 }
+         return {}
+
+     def _check_common_issues(self, tree: ast.AST) -> List[str]:
+         """Check for common code issues"""
+         issues = []
+
+         # Check for global variables
+         for node in ast.walk(tree):
+             if isinstance(node, ast.Global):
+                 issues.append("Global variables detected")
+
+         # Check for long functions
+         for node in ast.walk(tree):
+             if isinstance(node, ast.FunctionDef):
+                 if len(node.body) > 50:
+                     issues.append(f"Function {node.name} is too long")
+
+         # Check for complex if statements
+         for node in ast.walk(tree):
+             if isinstance(node, ast.If):
+                 if len(node.body) > 20:
+                     issues.append("Complex if statement detected")
+
+         return issues
+
+     def _calculate_validation_score(self, metrics: Dict[str, Any], issues: List[str]) -> float:
+         """Calculate overall validation score"""
+         score = 1.0
+
+         # Penalize for code complexity
+         score *= 0.9 if metrics["complexity"] > 10 else 1.0
+
+         # Penalize for issues
+         score *= 0.9 ** len(issues)
+
+         return max(0.0, min(1.0, score))
+
+     def create_checkpoint(self, data: Dict[str, Any], name: Optional[str] = None) -> str:
+         """Create a checkpoint of validation data"""
+         if name is None:
+             name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+
+         checkpoint_path = self.validation_dir / f"checkpoint_{name}.json"
+
+         # Add timestamp and hash
+         data["timestamp"] = str(datetime.datetime.now())
+         data["hash"] = hashlib.sha256(str(data).encode()).hexdigest()
+
+         with open(checkpoint_path, 'w') as f:
+             json.dump(data, f, indent=2)
+
+         return str(checkpoint_path)
+
+     def load_checkpoint(self, name: str) -> Optional[Dict[str, Any]]:
+         """Load a validation checkpoint"""
+         checkpoint_path = self.validation_dir / f"checkpoint_{name}.json"
+         if not checkpoint_path.exists():
+             return None
+
+         with open(checkpoint_path, 'r') as f:
+             return json.load(f)
+
+     def validate_dataset(self, dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """Validate a complete dataset"""
+         results = []
+         error_count = 0
+
+         for idx, example in enumerate(dataset):
+             try:
+                 # Validate code
+                 if "code" in example:
+                     code_result = self.validate_code(
+                         example["code"],
+                         example.get("language", "python")
+                     )
+                     results.append(code_result)
+
+                 # Validate code review
+                 if "review" in example:
+                     review_result = self._validate_code_review(
+                         example["code"],
+                         example["review"]
+                     )
+                     results.append(review_result)
+             except Exception as e:
+                 error_count += 1
+                 results.append({
+                     "error": str(e),
+                     "validation_score": 0.0
+                 })
+
+         # Calculate overall metrics
+         scores = [r["validation_score"] for r in results if "validation_score" in r]
+         if scores:
+             avg_score = np.mean(scores)
+         else:
+             avg_score = 0.0
+
+         return {
+             "total_examples": len(dataset),
+             "processed_examples": len(results),
+             "error_count": error_count,
+             "average_score": float(avg_score),
+             "detailed_results": results
+         }
+
+     def save_validation_report(self, report: Dict[str, Any], name: Optional[str] = None) -> str:
+         """Save a validation report"""
+         if name is None:
+             name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+
+         report_path = self.validation_dir / f"report_{name}.json"
+
+         # Add timestamp and summary metrics
+         report["timestamp"] = str(datetime.datetime.now())
+         report["summary"] = {
+             "accuracy": report.get("average_score", 0.0),
+             "error_rate": report.get("error_count", 0) / report.get("total_examples", 1)
+         }
+
+         with open(report_path, 'w') as f:
+             json.dump(report, f, indent=2)
+
+         return str(report_path)
+
+     def _validate_code_review(self, code: str, review: str) -> Dict[str, Any]:
+         """Validate code review comments"""
+         try:
+             # Validate code
+             code_result = self.validate_code(code)
+
+             # Check if review addresses key issues
+             issues = self._check_common_issues(ast.parse(code))
+             review_issues = [issue for issue in issues if issue.lower() in review.lower()]
+
+             return {
+                 "is_valid": len(review_issues) > 0,
+                 "review_issues_covered": len(review_issues),
+                 "total_issues": len(issues),
+                 "validation_score": code_result["validation_score"] * (len(review_issues) / len(issues) if issues else 1.0)
+             }
+         except Exception as e:
+             return {
+                 "is_valid": False,
+                 "error": str(e),
+                 "validation_score": 0.0
+             }
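
A short sketch of the validator on its own; pylint is assumed to be installed for the static-analysis step, and validate_code degrades to an error record inside static_analysis if it is not:

    from coding_expert.tasks.validation import CodeValidator

    validator = CodeValidator(checkpoint_dir="checkpoints")
    result = validator.validate_code("for i in range(3):\n    print(i)\n")
    print(result["is_valid"], result["metrics"]["complexity"])

    # Persist the result; the checkpoint name defaults to a timestamp
    print(validator.create_checkpoint(result))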
coding_expert/utils/data_processor.py ADDED
@@ -0,0 +1,253 @@
+ """
+ Data processing utilities for the Coding Expert model
+ """
+ import json
+ import psutil
+ from pathlib import Path
+ import jsonlines
+ from typing import Dict, List, Any, Optional, Tuple
+ import hashlib
+ import datetime
+ import logging
+ import numpy as np
+ import pandas as pd
+ from datasets import Dataset
+ from tqdm import tqdm
+ import ast
+ import re
+ from collections import Counter
+
+ class CodeDataProcessor:
+     def __init__(self, output_dir: str = "processed_data"):
+         self.output_dir = Path(output_dir)
+         self.output_dir.mkdir(exist_ok=True)
+         self.logger = self._setup_logger()
+
+     def _setup_logger(self) -> logging.Logger:
+         """Set up logging specific to code processing"""
+         logger = logging.getLogger(__name__)
+         logger.setLevel(logging.INFO)
+         handler = logging.StreamHandler()
+         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+         handler.setFormatter(formatter)
+         logger.addHandler(handler)
+         return logger
+
+     def process_code(self, code: str, language: str = "python") -> Dict[str, Any]:
+         """Process and analyze a code snippet"""
+         try:
+             # Basic cleaning
+             code = self._clean_code(code)
+
+             # Parse AST if possible
+             ast_info = self._parse_ast(code, language)
+
+             # Extract code metrics
+             metrics = self._extract_code_metrics(code, ast_info)
+
+             # Identify patterns and anti-patterns
+             patterns = self._identify_patterns(code)
+
+             return {
+                 "code": code,
+                 "language": language,
+                 "ast_info": ast_info,
+                 "metrics": metrics,
+                 "patterns": patterns
+             }
+         except Exception as e:
+             self.logger.warning(f"Error processing code: {str(e)}")
+             return {"error": str(e)}
+
+     def _clean_code(self, code: str) -> str:
+         """Clean code by trimming trailing whitespace and dropping blank lines"""
+         # Remove surrounding whitespace
+         code = code.strip()
+
+         # Drop blank lines; rstrip (not strip) preserves indentation
+         lines = [line.rstrip() for line in code.split('\n') if line.strip()]
+         code = '\n'.join(lines)
+
+         return code
+
+     def _parse_ast(self, code: str, language: str) -> Dict[str, Any]:
+         """Parse code into AST and extract structure"""
+         try:
+             if language == "python":
+                 tree = ast.parse(code)
+                 return {
+                     "num_functions": len([node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]),
+                     "num_classes": len([node for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]),
+                     "complexity": self._calculate_complexity(tree)
+                 }
+             return {}
+         except Exception as e:
+             return {"error": str(e)}
+
+     def _calculate_complexity(self, tree: ast.AST) -> int:
+         """Calculate cyclomatic complexity"""
+         complexity = 1  # Start with 1 for the main program
+         for node in ast.walk(tree):
+             if isinstance(node, (ast.If, ast.For, ast.While, ast.Try, ast.ExceptHandler)):
+                 complexity += 1
+         return complexity
+
+     def _extract_code_metrics(self, code: str, ast_info: Dict[str, Any]) -> Dict[str, Any]:
+         """Extract various code metrics"""
+         metrics = {
+             "length": len(code),
+             "lines": len(code.split('\n')),
+             "tokens": len(code.split()),
+             "unique_tokens": len(set(code.split())),
+             "ast_complexity": ast_info.get("complexity", 0),
+             "function_count": ast_info.get("num_functions", 0),
+             "class_count": ast_info.get("num_classes", 0)
+         }
+
+         # Calculate token distribution
+         tokens = code.split()
+         token_dist = Counter(tokens)
+         metrics["token_distribution"] = token_dist.most_common(5)
+
+         return metrics
+
+     def _identify_patterns(self, code: str) -> Dict[str, List[str]]:
+         """Identify common code patterns and anti-patterns"""
+         patterns = {
+             "design_patterns": [],
+             "anti_patterns": [],
+             "security_issues": []
+         }
+
+         # Look for common design patterns
+         if "class" in code and "def" in code:
+             patterns["design_patterns"].append("Class-based design")
+
+         # Look for anti-patterns
+         if "global" in code:
+             patterns["anti_patterns"].append("Global variables")
+
+         # Look for security issues
+         if "eval(" in code:
+             patterns["security_issues"].append("Eval usage")
+
+         return patterns
+
+     def process_dataset(self, dataset: Dataset, dataset_name: str) -> List[Dict[str, Any]]:
+         """Process a complete dataset"""
+         processed = []
+         error_count = 0
+
+         self.logger.info(f"Processing {dataset_name} dataset with {len(dataset)} samples")
+
+         for idx, example in enumerate(tqdm(dataset, desc=f"Processing {dataset_name}")):
+             try:
+                 processed_example = self._process_example(example, dataset_name)
+                 processed.append(processed_example)
+             except Exception as e:
+                 error_count += 1
+                 self.logger.error(f"Error processing example {idx} in {dataset_name}: {str(e)}")
+
+         self.logger.info(f"Processed {len(processed)} examples")
+         self.logger.info(f"Encountered {error_count} errors")
+
+         return processed
+
+     def _process_example(self, example: Dict[str, Any], dataset_name: str) -> Dict[str, Any]:
+         """Process a single example based on dataset type"""
+         if dataset_name == "CodeSearchNet":
+             return self._process_code_search_net(example)
+         elif dataset_name == "HumanEval":
+             return self._process_human_eval(example)
+         elif dataset_name == "MBPP":
+             return self._process_mbpp(example)
+         elif dataset_name == "Spider":
+             return self._process_spider(example)
+         elif dataset_name == "DeepFix":
+             return self._process_deep_fix(example)
+         elif dataset_name == "CodeXGLUE":
+             return self._process_codexglue(example)
+         else:
+             raise ValueError(f"Unknown dataset: {dataset_name}")
+
+     def _process_code_search_net(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process CodeSearchNet example"""
+         return {
+             "code": example["code"].strip(),
+             "docstring": example["docstring"].strip(),
+             "language": example["language"],
+             "function_name": example["function_name"],
+             "code_analysis": self.process_code(example["code"])  # Reuse code processing
+         }
+
+     def _process_human_eval(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process HumanEval example"""
+         return {
+             "task_id": example["task_id"],
+             "prompt": example["prompt"].strip(),
+             "solution": example["canonical_solution"].strip(),
+             "test": example["test"].strip(),
+             "entry_point": example["entry_point"],
+             "code_analysis": self.process_code(example["canonical_solution"])  # Reuse code processing
+         }
+
+     def _process_mbpp(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process MBPP example"""
+         return {
+             "task_id": example["task_id"],
+             "problem": example["text"].strip(),
+             "solution": example["code"].strip(),
+             "test_list": example["test_list"],
+             "challenge_test_list": example["challenge_test_list"],
+             "code_analysis": self.process_code(example["code"])  # Reuse code processing
+         }
+
+     def _process_spider(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process Spider example"""
+         return {
+             "query": example["query"].strip(),
+             "question": example["question"].strip(),
+             "db_id": example["db_id"],
+             "sql": example["sql"].strip(),
+             "code_analysis": self.process_code(example["sql"])  # Reuse code processing
+         }
+
+     def _process_deep_fix(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process DeepFix example"""
+         return {
+             "original_code": example["code"].strip(),
+             "fixed_code": example["fixed_code"].strip(),
+             "error_type": example["error_type"],
+             "code_analysis": self.process_code(example["fixed_code"])  # Reuse code processing
+         }
+
+     def _process_codexglue(self, example: Dict[str, Any]) -> Dict[str, Any]:
+         """Process CodeXGLUE example"""
+         return {
+             "code": example["code"].strip(),
+             "docstring": example["docstring"].strip(),
+             "task": example["task"],
+             "language": example["language"],
+             "code_analysis": self.process_code(example["code"])  # Reuse code processing
+         }
+
+     def save_to_jsonl(self, data: List[Dict[str, Any]], filename: str) -> Path:
+         """Save processed data to JSONL file"""
+         filepath = self.output_dir / filename
+         with jsonlines.open(filepath, mode='w') as writer:
+             writer.write_all(data)
+         self.logger.info(f"Saved data to {filepath}")
+         return filepath
+
+     def print_sample(self, data: List[Dict[str, Any]], count: int = 3):
+         """Print sample of processed data"""
+         self.logger.info("\nSample data:")
+         for i, example in enumerate(data[:count]):
+             self.logger.info(f"\nSample {i+1}:")
+             self.logger.info(json.dumps(example, indent=2))
+
+     def print_memory_usage(self):
+         """Print current memory usage"""
+         process = psutil.Process()
+         memory_info = process.memory_info()
+         self.logger.info(f"Current memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")