-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
113 lines (91 loc) · 3.35 KB
/
config.py
File metadata and controls
113 lines (91 loc) · 3.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""Application configuration with environment-aware settings."""
import os
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
from pathlib import Path
class Environment(Enum):
    """Run modes the application can be started in.

    Every member's value is the lowercase form of its name, so a member can
    be looked up from that string, e.g. ``Environment("production")``.
    """

    DEVELOPMENT = "development"
    PRODUCTION = "production"
    TESTING = "testing"
class EmbeddingModel(Enum):
    """Embedding models that can be selected for ChromaDB.

    Each member's value is the identifier string associated with that model.
    """

    OPENAI_SMALL = "text-embedding-3-small"
    OPENAI_LARGE = "text-embedding-3-large"
    DEFAULT = "default"
@dataclass
class ChromaConfig:
    """Settings for the ChromaDB connection.

    Note that constructing an instance touches the filesystem:
    ``persist_directory`` is created (including missing parents) when it
    does not exist yet.
    """

    persist_directory: str = "./chroma_data"
    default_collection: str = "code_collection"
    embedding_model: EmbeddingModel = EmbeddingModel.OPENAI_SMALL
    batch_size: int = 100
    max_results: int = 20

    def __post_init__(self):
        """Create ``persist_directory`` on disk if it is missing."""
        storage_dir = Path(self.persist_directory)
        # exist_ok=True keeps repeated construction from raising.
        storage_dir.mkdir(parents=True, exist_ok=True)
@dataclass
class SearchConfig:
    """Tunable limits and defaults for search.

    This is a plain value container: it performs no validation and has no
    derived fields. The values are interpreted by whatever consumes them.
    """

    # Result-count settings.
    default_n_results: int = 10
    max_n_results: int = 50

    # Query-length bounds.
    min_query_length: int = 2
    max_query_length: int = 500

    score_precision: int = 4

    # Regex-search settings.
    regex_max_results: int = 100
    regex_timeout_seconds: float = 5.0
@dataclass
class IngestionConfig:
    """Settings for the code ingestion pipeline.

    This is a plain value container: it performs no validation and has no
    derived fields.
    """

    max_tokens_per_chunk: int = 1000

    # File selection: extensions to pick up and path patterns to skip.
    supported_extensions: tuple = (".py",)
    ignore_patterns: tuple = ("__pycache__", ".git", ".env", "node_modules")

    batch_size: int = 100

    # Tokenizer selection; the second name is the fallback encoding.
    tokenizer_model: str = "text-embedding-3-small"
    fallback_encoding: str = "cl100k_base"
@dataclass
class ExportConfig:
    """Settings for the export service.

    This is a plain value container. Nothing here checks that
    ``default_format`` is one of ``supported_formats``.
    """

    default_format: str = "json"
    supported_formats: tuple = ("json", "csv")
    max_export_chunks: int = 10000
@dataclass
class DiffConfig:
    """Settings for the collection diff service.

    This is a plain value container: it performs no validation and has no
    derived fields.
    """

    similarity_threshold: float = 0.98
    max_diff_results: int = 50
    include_modified_by_default: bool = True
# Placeholder used when SECRET_KEY is not set; only acceptable outside production.
_DEV_SECRET_KEY = "dev-secret-key-change-in-production"


def _environment_from_env() -> Environment:
    """Resolve the run mode from the ``FLASK_ENV`` environment variable.

    The raw value is stripped and lower-cased before the lookup, so values
    such as ``"Production"`` or ``" testing "`` resolve to the expected
    member. An unset or blank variable selects ``Environment.DEVELOPMENT``.

    Raises:
        ValueError: If the variable holds something that is not an
            ``Environment`` value; the message lists the accepted values.
    """
    raw = os.getenv("FLASK_ENV", "")
    name = raw.strip().lower() or Environment.DEVELOPMENT.value
    try:
        return Environment(name)
    except ValueError:
        allowed = ", ".join(member.value for member in Environment)
        raise ValueError(
            f"Invalid FLASK_ENV {raw!r}; expected one of: {allowed}"
        ) from None


@dataclass
class AppConfig:
    """Root application configuration combining all sub-configs.

    Field defaults are read from the process environment at construction
    time:

    * ``environment`` from ``FLASK_ENV`` (case-insensitive, default
      ``development``),
    * ``secret_key`` from ``SECRET_KEY`` (default: a development
      placeholder),
    * ``openai_api_key`` from ``OPENAI_API_KEY`` (``None`` when unset).

    ``debug`` is not an ``__init__`` argument; it is derived in
    ``__post_init__`` and is ``True`` only in the development environment.
    """

    environment: Environment = field(default_factory=_environment_from_env)
    secret_key: str = field(
        default_factory=lambda: os.getenv("SECRET_KEY", _DEV_SECRET_KEY)
    )
    # NOTE(review): "0.0.0.0" listens on every network interface; combined
    # with debug=True in development this may expose the app more widely
    # than intended. Confirm this default is deliberate.
    host: str = "0.0.0.0"
    port: int = 5000
    debug: bool = field(init=False)
    openai_api_key: Optional[str] = field(
        default_factory=lambda: os.getenv("OPENAI_API_KEY")
    )
    chroma: ChromaConfig = field(default_factory=ChromaConfig)
    search: SearchConfig = field(default_factory=SearchConfig)
    ingestion: IngestionConfig = field(default_factory=IngestionConfig)
    export: ExportConfig = field(default_factory=ExportConfig)
    diff: DiffConfig = field(default_factory=DiffConfig)

    def __post_init__(self):
        """Derive ``debug`` and warn about an insecure production secret."""
        self.debug = self.environment == Environment.DEVELOPMENT

        if (
            self.environment == Environment.PRODUCTION
            and self.secret_key == _DEV_SECRET_KEY
        ):
            # Warn rather than raise so existing deployments keep starting,
            # while the misconfiguration is no longer silent.
            import warnings

            warnings.warn(
                "SECRET_KEY is not set: the development placeholder secret "
                "is being used in production.",
                RuntimeWarning,
                stacklevel=3,
            )

    @classmethod
    def from_environment(cls) -> "AppConfig":
        """Factory: build config from environment variables.

        In addition to the per-field environment defaults, the ChromaDB
        persist directory is taken from ``CHROMA_PERSIST_DIR`` (default
        ``./chroma_data``).
        """
        persist_dir = os.getenv("CHROMA_PERSIST_DIR", "./chroma_data")
        return cls(chroma=ChromaConfig(persist_directory=persist_dir))
def get_config() -> AppConfig:
    """Build and return the application configuration.

    Delegates to ``AppConfig.from_environment``. Nothing is cached: every
    call constructs a new ``AppConfig``.
    """
    config = AppConfig.from_environment()
    return config