nivakaran commited on
Commit
c7d4394
·
verified ·
1 Parent(s): ac649ea

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.template +46 -0
  2. .gitignore +52 -8
  3. models/anomaly-detection/.dockerignore +8 -0
  4. models/anomaly-detection/.gitignore +11 -0
  5. models/anomaly-detection/Dockerfile +1 -0
  6. models/anomaly-detection/README.md +45 -0
  7. models/anomaly-detection/artifacts/data_ingestion/ingested_data_20251208_152330.parquet +3 -0
  8. models/anomaly-detection/artifacts/data_transformation/embeddings_20251208_152444.npy +3 -0
  9. models/anomaly-detection/artifacts/data_transformation/features_20251208_152444.npy +3 -0
  10. models/anomaly-detection/artifacts/data_transformation/transformed_data_20251208_152444.parquet +3 -0
  11. models/anomaly-detection/artifacts/data_validation/validated_data_20251208_152332.parquet +3 -0
  12. models/anomaly-detection/artifacts/data_validation/validation_report_20251208_152332.yaml +7 -0
  13. models/anomaly-detection/dags/.airflowignore +0 -0
  14. models/anomaly-detection/dags/exampledag.py +100 -0
  15. models/anomaly-detection/dags/train_anomaly_model.py +241 -0
  16. models/anomaly-detection/data_schema/schema.yaml +133 -0
  17. models/anomaly-detection/download_models.py +86 -0
  18. models/anomaly-detection/main.py +85 -0
  19. models/anomaly-detection/packages.txt +0 -0
  20. models/anomaly-detection/requirements.txt +37 -0
  21. models/anomaly-detection/src/__init__.py +18 -0
  22. models/anomaly-detection/src/components/__init__.py +29 -0
  23. models/anomaly-detection/src/components/data_ingestion.py +247 -0
  24. models/anomaly-detection/src/components/data_transformation.py +458 -0
  25. models/anomaly-detection/src/components/data_validation.py +261 -0
  26. models/anomaly-detection/src/components/model_trainer.py +478 -0
  27. models/anomaly-detection/src/entity/__init__.py +30 -0
  28. models/anomaly-detection/src/entity/artifact_entity.py +79 -0
  29. models/anomaly-detection/src/entity/config_entity.py +109 -0
  30. models/anomaly-detection/src/pipeline/__init__.py +6 -0
  31. models/anomaly-detection/src/pipeline/training_pipeline.py +162 -0
  32. models/anomaly-detection/src/utils/__init__.py +24 -0
  33. models/anomaly-detection/src/utils/language_detector.py +209 -0
  34. models/anomaly-detection/src/utils/metrics.py +256 -0
  35. models/anomaly-detection/src/utils/vectorizer.py +243 -0
  36. models/anomaly-detection/tests/dags/test_dag_example.py +83 -0
  37. models/currency-volatility-prediction/.github/workflows/main.yaml +0 -0
  38. models/currency-volatility-prediction/.gitignore +13 -0
  39. models/currency-volatility-prediction/.python-version +1 -0
  40. models/currency-volatility-prediction/Dockerfile +8 -0
  41. models/currency-volatility-prediction/README.md +0 -0
  42. models/currency-volatility-prediction/app.py +0 -0
  43. models/currency-volatility-prediction/dags/currency_prediction_dag.py +212 -0
  44. models/currency-volatility-prediction/data_schema/schema.yaml +0 -0
  45. models/currency-volatility-prediction/main.py +179 -0
  46. models/currency-volatility-prediction/pyproject.toml +20 -0
  47. models/currency-volatility-prediction/requirements.txt +15 -0
  48. models/currency-volatility-prediction/setup.py +47 -0
  49. models/currency-volatility-prediction/src/__init__.py +21 -0
  50. models/currency-volatility-prediction/src/components/__init__.py +0 -0
.env.template ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ModelX Intelligence Platform - Production Configuration
2
+ # Copy this to .env and fill in your values
3
+
4
+ # =============================================================================
5
+ # LLM CONFIGURATION
6
+ # =============================================================================
7
+ # Get your free API key from https://console.groq.com
8
+ GROQ_API_KEY=your_groq_api_key_here
9
+
10
+ # =============================================================================
11
+ # STORAGE CONFIGURATION
12
+ # =============================================================================
13
+
14
+ # SQLite Cache
15
+ SQLITE_DB_PATH=data/cache/feeds.db
16
+ SQLITE_RETENTION_HOURS=24
17
+
18
+ # ChromaDB (Semantic Search)
19
+ CHROMADB_PATH=data/chromadb
20
+ CHROMADB_COLLECTION=modelx_feeds
21
+ CHROMADB_SIMILARITY_THRESHOLD=0.85
22
+ CHROMADB_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
23
+
24
+ # Neo4j Knowledge Graph
25
+ NEO4J_ENABLED=false # Set to 'true' to enable (requires Docker)
26
+ NEO4J_URI=bolt://localhost:7687
27
+ NEO4J_USER=neo4j
28
+ NEO4J_PASSWORD=modelx2024
29
+
30
+ # CSV Export
31
+ CSV_EXPORT_DIR=data/feeds
32
+
33
+ # Deduplication
34
+ EXACT_MATCH_CHARS=120
35
+
36
+ # =============================================================================
37
+ # API CONFIGURATION
38
+ # =============================================================================
39
+ API_HOST=0.0.0.0
40
+ API_PORT=8000
41
+ API_WORKERS=1
42
+
43
+ # =============================================================================
44
+ # FRONTEND CONFIGURATION
45
+ # =============================================================================
46
+ NEXT_PUBLIC_API_URL=http://localhost:8000
.gitignore CHANGED
@@ -9,15 +9,59 @@ wheels/
9
  # Virtual environments
10
  .venv
11
 
12
- .env
13
- .env.template
14
 
 
 
15
 
16
- #Data & Models (Un-ignored for Hackathon Demo persistence)
17
- models/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  data/
19
  datasets/
20
- data
21
- datasets
22
- models
23
- .langgraph_api
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  # Virtual environments
10
  .venv
11
 
12
+ # Environment files
13
+ .env
14
 
15
+ # LangGraph
16
+ .langgraph_api
17
 
18
+ # =============================================================================
19
+ # ML MODEL ARTIFACTS (Large files - don't push to Git)
20
+ # =============================================================================
21
+
22
+ # Trained model weights (large binary files)
23
+ *.h5
24
+ *.hdf5
25
+ *.joblib
26
+ *.pkl
27
+ *.pickle
28
+ *.pt
29
+ *.pth
30
+ *.onnx
31
+ *.pb
32
+
33
+ # Model output directories
34
+ models/*/artifacts/models/
35
+ models/*/output/
36
+ models/*/models_cache/
37
+ models/*/checkpoints/
38
+
39
+ # Airflow local state
40
+ models/*/.astro/
41
+
42
+ # MLflow artifacts (tracked separately)
43
+ mlruns/
44
+ mlartifacts/
45
+
46
+ # =============================================================================
47
+ # DATA FILES (Can be large)
48
+ # =============================================================================
49
  data/
50
  datasets/
51
+
52
+ # Database files
53
+ *.db
54
+ *.sqlite
55
+ *.sqlite3
56
+
57
+ # ChromaDB persistence (can be large)
58
+ chroma_db/
59
+
60
+ # =============================================================================
61
+ # KEEP THESE (source code, configs)
62
+ # =============================================================================
63
+ # The models/ folders themselves ARE tracked for:
64
+ # - main.py, src/, dags/ (pipeline code)
65
+ # - requirements.txt, setup.py (dependencies)
66
+ # - data_schema/ (validation configs)
67
+ # - README.md (documentation)
models/anomaly-detection/.dockerignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ astro
2
+ .git
3
+ .env
4
+ airflow_settings.yaml
5
+ logs/
6
+ .venv
7
+ airflow.db
8
+ airflow.cfg
models/anomaly-detection/.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .env
3
+ .DS_Store
4
+ airflow_settings.yaml
5
+ __pycache__/
6
+ astro
7
+ .venv
8
+ airflow-webserver.pid
9
+ webserver_config.py
10
+ airflow.cfg
11
+ airflow.db
models/anomaly-detection/Dockerfile ADDED
@@ -0,0 +1 @@
 
 
1
+ FROM astrocrpublic.azurecr.io/runtime:3.1-7
models/anomaly-detection/README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Overview
2
+ ========
3
+
4
+ Welcome to Astronomer! This project was generated after you ran 'astro dev init' using the Astronomer CLI. This readme describes the contents of the project, as well as how to run Apache Airflow on your local machine.
5
+
6
+ Project Contents
7
+ ================
8
+
9
+ Your Astro project contains the following files and folders:
10
+
11
+ - dags: This folder contains the Python files for your Airflow DAGs. By default, this directory includes one example DAG:
12
+ - `example_astronauts`: This DAG shows a simple ETL pipeline example that queries the list of astronauts currently in space from the Open Notify API and prints a statement for each astronaut. The DAG uses the TaskFlow API to define tasks in Python, and dynamic task mapping to dynamically print a statement for each astronaut. For more on how this DAG works, see our [Getting started tutorial](https://www.astronomer.io/docs/learn/get-started-with-airflow).
13
+ - Dockerfile: This file contains a versioned Astro Runtime Docker image that provides a differentiated Airflow experience. If you want to execute other commands or overrides at runtime, specify them here.
14
+ - include: This folder contains any additional files that you want to include as part of your project. It is empty by default.
15
+ - packages.txt: Install OS-level packages needed for your project by adding them to this file. It is empty by default.
16
+ - requirements.txt: Install Python packages needed for your project by adding them to this file. It is empty by default.
17
+ - plugins: Add custom or community plugins for your project to this file. It is empty by default.
18
+ - airflow_settings.yaml: Use this local-only file to specify Airflow Connections, Variables, and Pools instead of entering them in the Airflow UI as you develop DAGs in this project.
19
+
20
+ Deploy Your Project Locally
21
+ ===========================
22
+
23
+ Start Airflow on your local machine by running 'astro dev start'.
24
+
25
+ This command will spin up five Docker containers on your machine, each for a different Airflow component:
26
+
27
+ - Postgres: Airflow's Metadata Database
28
+ - Scheduler: The Airflow component responsible for monitoring and triggering tasks
29
+ - DAG Processor: The Airflow component responsible for parsing DAGs
30
+ - API Server: The Airflow component responsible for serving the Airflow UI and API
31
+ - Triggerer: The Airflow component responsible for triggering deferred tasks
32
+
33
+ When all five containers are ready the command will open the browser to the Airflow UI at http://localhost:8080/. You should also be able to access your Postgres Database at 'localhost:5432/postgres' with username 'postgres' and password 'postgres'.
34
+
35
+ Note: If you already have either of the above ports allocated, you can either [stop your existing Docker containers or change the port](https://www.astronomer.io/docs/astro/cli/troubleshoot-locally#ports-are-not-available-for-my-local-airflow-webserver).
36
+
37
+ Deploy Your Project to Astronomer
38
+ =================================
39
+
40
+ If you have an Astronomer account, pushing code to a Deployment on Astronomer is simple. For deploying instructions, refer to Astronomer documentation: https://www.astronomer.io/docs/astro/deploy-code/
41
+
42
+ Contact
43
+ =======
44
+
45
+ The Astronomer CLI is maintained with love by the Astronomer team. To report a bug or suggest a change, reach out to our support.
models/anomaly-detection/artifacts/data_ingestion/ingested_data_20251208_152330.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3273d845dda217a76cae1c63e9e181fad07c212ed7bbbce008ceaa380012e586
3
+ size 104390
models/anomaly-detection/artifacts/data_transformation/embeddings_20251208_152444.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f476c6f7aee3c0d1acb74e054dd4a3b5c47170b8f476f5f273a9a87c106d4d14
3
+ size 586880
models/anomaly-detection/artifacts/data_transformation/features_20251208_152444.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fad8dcf39385d403aca32cae883fa2030055bef9d0e5ee589bd86799adad69e3
3
+ size 1185856
models/anomaly-detection/artifacts/data_transformation/transformed_data_20251208_152444.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42202d32a4c0a02477aa9b90c902a2dd0ce057dc66562a1629198636d5431f80
3
+ size 112761
models/anomaly-detection/artifacts/data_validation/validated_data_20251208_152332.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3273d845dda217a76cae1c63e9e181fad07c212ed7bbbce008ceaa380012e586
3
+ size 104390
models/anomaly-detection/artifacts/data_validation/validation_report_20251208_152332.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ errors: []
2
+ input_path: C:\Users\LENOVO\Desktop\ModelX-Ultimate\models\anomaly-detection\artifacts\data_ingestion\ingested_data_20251208_152330.parquet
3
+ invalid_records: 0
4
+ total_records: 191
5
+ valid_records: 191
6
+ validation_status: true
7
+ validation_timestamp: '20251208_152332'
models/anomaly-detection/dags/.airflowignore ADDED
File without changes
models/anomaly-detection/dags/exampledag.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ## Astronaut ETL example DAG
3
+
4
+ This DAG queries the list of astronauts currently in space from the
5
+ Open Notify API and prints each astronaut's name and flying craft.
6
+
7
+ There are two tasks, one to get the data from the API and save the results,
8
+ and another to print the results. Both tasks are written in Python using
9
+ Airflow's TaskFlow API, which allows you to easily turn Python functions into
10
+ Airflow tasks, and automatically infer dependencies and pass data.
11
+
12
+ The second task uses dynamic task mapping to create a copy of the task for
13
+ each Astronaut in the list retrieved from the API. This list will change
14
+ depending on how many Astronauts are in space, and the DAG will adjust
15
+ accordingly each time it runs.
16
+
17
+ For more explanation and getting started instructions, see our Write your
18
+ first DAG tutorial: https://www.astronomer.io/docs/learn/get-started-with-airflow
19
+
20
+ ![Picture of the ISS](https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2010/02/space_station_over_earth/10293696-3-eng-GB/Space_Station_over_Earth_card_full.jpg)
21
+ """
22
+
23
+ from airflow.sdk.definitions.asset import Asset
24
+ from airflow.decorators import dag, task
25
+ from pendulum import datetime
26
+ import requests
27
+
28
+
29
+ # Define the basic parameters of the DAG, like schedule and start_date
30
+ @dag(
31
+ start_date=datetime(2024, 1, 1),
32
+ schedule="@daily",
33
+ catchup=False,
34
+ doc_md=__doc__,
35
+ default_args={"owner": "Astro", "retries": 3},
36
+ tags=["example"],
37
+ )
38
+ def example_astronauts():
39
+ # Define tasks
40
+ @task(
41
+ # Define a dataset outlet for the task. This can be used to schedule downstream DAGs when this task has run.
42
+ outlets=[Asset("current_astronauts")]
43
+ ) # Define that this task updates the `current_astronauts` Dataset
44
+ def get_astronauts(**context) -> list[dict]:
45
+ """
46
+ This task uses the requests library to retrieve a list of Astronauts
47
+ currently in space. The results are pushed to XCom with a specific key
48
+ so they can be used in a downstream pipeline. The task returns a list
49
+ of Astronauts to be used in the next task.
50
+ """
51
+ try:
52
+ r = requests.get("http://api.open-notify.org/astros.json")
53
+ r.raise_for_status()
54
+ number_of_people_in_space = r.json()["number"]
55
+ list_of_people_in_space = r.json()["people"]
56
+ except Exception:
57
+ print("API currently not available, using hardcoded data instead.")
58
+ number_of_people_in_space = 12
59
+ list_of_people_in_space = [
60
+ {"craft": "ISS", "name": "Oleg Kononenko"},
61
+ {"craft": "ISS", "name": "Nikolai Chub"},
62
+ {"craft": "ISS", "name": "Tracy Caldwell Dyson"},
63
+ {"craft": "ISS", "name": "Matthew Dominick"},
64
+ {"craft": "ISS", "name": "Michael Barratt"},
65
+ {"craft": "ISS", "name": "Jeanette Epps"},
66
+ {"craft": "ISS", "name": "Alexander Grebenkin"},
67
+ {"craft": "ISS", "name": "Butch Wilmore"},
68
+ {"craft": "ISS", "name": "Sunita Williams"},
69
+ {"craft": "Tiangong", "name": "Li Guangsu"},
70
+ {"craft": "Tiangong", "name": "Li Cong"},
71
+ {"craft": "Tiangong", "name": "Ye Guangfu"},
72
+ ]
73
+
74
+ context["ti"].xcom_push(
75
+ key="number_of_people_in_space", value=number_of_people_in_space
76
+ )
77
+ return list_of_people_in_space
78
+
79
+ @task
80
+ def print_astronaut_craft(greeting: str, person_in_space: dict) -> None:
81
+ """
82
+ This task creates a print statement with the name of an
83
+ Astronaut in space and the craft they are flying on from
84
+ the API request results of the previous task, along with a
85
+ greeting which is hard-coded in this example.
86
+ """
87
+ craft = person_in_space["craft"]
88
+ name = person_in_space["name"]
89
+
90
+ print(f"{name} is currently in space flying on the {craft}! {greeting}")
91
+
92
+ # Use dynamic task mapping to run the print_astronaut_craft task for each
93
+ # Astronaut in space
94
+ print_astronaut_craft.partial(greeting="Hello! :)").expand(
95
+ person_in_space=get_astronauts() # Define dependencies using TaskFlow API syntax
96
+ )
97
+
98
+
99
+ # Instantiate the DAG
100
+ example_astronauts()
models/anomaly-detection/dags/train_anomaly_model.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models/anomaly-detection/dags/train_anomaly_model.py
3
+ Apache Airflow DAG for scheduled anomaly detection model training
4
+ Uses Astronomer (Astro) for deployment
5
+ """
6
+ from datetime import datetime, timedelta
7
+ from airflow import DAG
8
+ from airflow.operators.python import PythonOperator, BranchPythonOperator
9
+ from airflow.operators.empty import EmptyOperator
10
+ from airflow.sensors.python import PythonSensor
11
+ import os
12
+ import sys
13
+ import logging
14
+
15
+ # Add project to path
16
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17
+ sys.path.insert(0, PROJECT_ROOT)
18
+
19
+ # Load .env from root ModelX directory for MLflow credentials
20
+ try:
21
+ from dotenv import load_dotenv
22
+ root_env = os.path.join(PROJECT_ROOT, '..', '..', '.env')
23
+ if os.path.exists(root_env):
24
+ load_dotenv(root_env)
25
+ else:
26
+ load_dotenv() # Try default locations
27
+ except ImportError:
28
+ pass
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # Configuration
33
+ BATCH_THRESHOLD = int(os.getenv("BATCH_THRESHOLD", "1000"))
34
+ SQLITE_DB_PATH = os.getenv("SQLITE_DB_PATH", "")
35
+
36
+ # Default DAG arguments
37
+ default_args = {
38
+ 'owner': 'modelx',
39
+ 'depends_on_past': False,
40
+ 'email_on_failure': False,
41
+ 'email_on_retry': False,
42
+ 'retries': 2,
43
+ 'retry_delay': timedelta(minutes=5),
44
+ }
45
+
46
+
47
+ def check_new_records(**context) -> bool:
48
+ """
49
+ Sensor function to check if enough new records exist.
50
+ Returns True if batch threshold is met or daily run is due.
51
+ """
52
+ import sqlite3
53
+ from datetime import datetime, timedelta
54
+
55
+ try:
56
+ # Get last training timestamp from XCom or default to 24h ago
57
+ last_training = context['ti'].xcom_pull(key='last_training_timestamp')
58
+ if not last_training:
59
+ last_training = (datetime.utcnow() - timedelta(hours=24)).isoformat()
60
+
61
+ # Check SQLite for new records
62
+ if SQLITE_DB_PATH and os.path.exists(SQLITE_DB_PATH):
63
+ conn = sqlite3.connect(SQLITE_DB_PATH)
64
+ cursor = conn.execute(
65
+ 'SELECT COUNT(*) FROM seen_hashes WHERE last_seen > ?',
66
+ (last_training,)
67
+ )
68
+ new_records = cursor.fetchone()[0]
69
+ conn.close()
70
+
71
+ logger.info(f"[AnomalyDAG] New records since {last_training}: {new_records}")
72
+
73
+ if new_records >= BATCH_THRESHOLD:
74
+ logger.info(f"[AnomalyDAG] Batch threshold met ({new_records} >= {BATCH_THRESHOLD})")
75
+ return True
76
+
77
+ # Check if 24 hours have passed (daily fallback)
78
+ if last_training:
79
+ last_dt = datetime.fromisoformat(last_training)
80
+ hours_since = (datetime.utcnow() - last_dt).total_seconds() / 3600
81
+ if hours_since >= 24:
82
+ logger.info(f"[AnomalyDAG] Daily run triggered ({hours_since:.1f}h since last run)")
83
+ return True
84
+
85
+ logger.info(f"[AnomalyDAG] Waiting for more records...")
86
+ return False
87
+
88
+ except Exception as e:
89
+ logger.error(f"[AnomalyDAG] Error checking records: {e}")
90
+ # Trigger anyway on error
91
+ return True
92
+
93
+
94
+ def run_data_ingestion(**context):
95
+ """Run data ingestion step"""
96
+ from src.components import DataIngestion
97
+ from src.entity import DataIngestionConfig
98
+
99
+ config = DataIngestionConfig()
100
+ ingestion = DataIngestion(config)
101
+ artifact = ingestion.ingest()
102
+
103
+ # Store artifact path in XCom
104
+ context['ti'].xcom_push(key='ingestion_artifact', value={
105
+ 'raw_data_path': artifact.raw_data_path,
106
+ 'total_records': artifact.total_records,
107
+ 'is_data_available': artifact.is_data_available
108
+ })
109
+
110
+ if not artifact.is_data_available:
111
+ raise ValueError("No data available for training")
112
+
113
+ return artifact.raw_data_path
114
+
115
+
116
+ def run_data_validation(**context):
117
+ """Run data validation step"""
118
+ from src.components import DataValidation
119
+ from src.entity import DataValidationConfig
120
+
121
+ # Get ingestion output from XCom
122
+ ingestion = context['ti'].xcom_pull(key='ingestion_artifact', task_ids='data_ingestion')
123
+ raw_data_path = ingestion['raw_data_path']
124
+
125
+ config = DataValidationConfig()
126
+ validation = DataValidation(config)
127
+ artifact = validation.validate(raw_data_path)
128
+
129
+ # Store artifact in XCom
130
+ context['ti'].xcom_push(key='validation_artifact', value={
131
+ 'validated_data_path': artifact.validated_data_path,
132
+ 'validation_status': artifact.validation_status,
133
+ 'valid_records': artifact.valid_records
134
+ })
135
+
136
+ return artifact.validated_data_path
137
+
138
+
139
+ def run_data_transformation(**context):
140
+ """Run data transformation step"""
141
+ from src.components import DataTransformation
142
+ from src.entity import DataTransformationConfig
143
+
144
+ # Get validation output from XCom
145
+ validation = context['ti'].xcom_pull(key='validation_artifact', task_ids='data_validation')
146
+ validated_data_path = validation['validated_data_path']
147
+
148
+ config = DataTransformationConfig()
149
+ transformation = DataTransformation(config)
150
+ artifact = transformation.transform(validated_data_path)
151
+
152
+ # Store artifact in XCom
153
+ context['ti'].xcom_push(key='transformation_artifact', value={
154
+ 'feature_store_path': artifact.feature_store_path,
155
+ 'language_distribution': artifact.language_distribution,
156
+ 'total_records': artifact.total_records
157
+ })
158
+
159
+ return artifact.feature_store_path
160
+
161
+
162
+ def run_model_training(**context):
163
+ """Run model training with Optuna and MLflow"""
164
+ from src.components import ModelTrainer
165
+ from src.entity import ModelTrainerConfig
166
+ from datetime import datetime
167
+
168
+ # Get transformation output from XCom
169
+ transformation = context['ti'].xcom_pull(key='transformation_artifact', task_ids='data_transformation')
170
+ feature_path = transformation['feature_store_path']
171
+
172
+ config = ModelTrainerConfig()
173
+ trainer = ModelTrainer(config)
174
+ artifact = trainer.train(feature_path)
175
+
176
+ # Store training timestamp for next run
177
+ context['ti'].xcom_push(key='last_training_timestamp', value=datetime.utcnow().isoformat())
178
+
179
+ # Store artifact in XCom
180
+ context['ti'].xcom_push(key='training_artifact', value={
181
+ 'best_model_name': artifact.best_model_name,
182
+ 'best_model_path': artifact.best_model_path,
183
+ 'mlflow_run_id': artifact.mlflow_run_id,
184
+ 'n_anomalies': artifact.n_anomalies
185
+ })
186
+
187
+ return artifact.best_model_path
188
+
189
+
190
+ # Create DAG
191
+ with DAG(
192
+ 'anomaly_detection_training',
193
+ default_args=default_args,
194
+ description='Train anomaly detection models on feed data',
195
+ schedule_interval=timedelta(hours=4), # Check every 4 hours
196
+ start_date=datetime(2024, 1, 1),
197
+ catchup=False,
198
+ tags=['ml', 'anomaly', 'modelx'],
199
+ ) as dag:
200
+
201
+ # Start
202
+ start = EmptyOperator(task_id='start')
203
+
204
+ # Sensor: Check for new records
205
+ check_records = PythonSensor(
206
+ task_id='check_new_records',
207
+ python_callable=check_new_records,
208
+ timeout=3600,
209
+ poke_interval=300, # Check every 5 minutes
210
+ mode='poke',
211
+ )
212
+
213
+ # Data Ingestion
214
+ data_ingestion = PythonOperator(
215
+ task_id='data_ingestion',
216
+ python_callable=run_data_ingestion,
217
+ )
218
+
219
+ # Data Validation
220
+ data_validation = PythonOperator(
221
+ task_id='data_validation',
222
+ python_callable=run_data_validation,
223
+ )
224
+
225
+ # Data Transformation
226
+ data_transformation = PythonOperator(
227
+ task_id='data_transformation',
228
+ python_callable=run_data_transformation,
229
+ )
230
+
231
+ # Model Training
232
+ model_training = PythonOperator(
233
+ task_id='model_training',
234
+ python_callable=run_model_training,
235
+ )
236
+
237
+ # End
238
+ end = EmptyOperator(task_id='end')
239
+
240
+ # Pipeline flow
241
+ start >> check_records >> data_ingestion >> data_validation >> data_transformation >> model_training >> end
models/anomaly-detection/data_schema/schema.yaml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Schema for Anomaly Detection Pipeline
2
+ # Based on the feeds dataset created by the combined graph
3
+
4
+ feed_columns:
5
+ # Required columns
6
+ post_id:
7
+ dtype: str
8
+ required: true
9
+ description: "Unique identifier for the post"
10
+
11
+ timestamp:
12
+ dtype: str # ISO format or Unix timestamp
13
+ required: true
14
+ description: "Post timestamp"
15
+
16
+ platform:
17
+ dtype: str
18
+ required: true
19
+ allowed_values: ["reddit", "facebook", "twitter", "linkedin", "instagram", "web"]
20
+ description: "Source platform"
21
+
22
+ category:
23
+ dtype: str
24
+ required: true
25
+ description: "Post category (competitor_mention, profile_monitoring, etc.)"
26
+
27
+ text:
28
+ dtype: str
29
+ required: true
30
+ min_length: 10
31
+ max_length: 5000
32
+ description: "Main text content"
33
+
34
+ content_hash:
35
+ dtype: str
36
+ required: true
37
+ description: "MD5/SHA256 hash for deduplication"
38
+
39
+ # Optional columns
40
+ entity:
41
+ dtype: str
42
+ required: false
43
+ description: "Referenced entity (Dialog, SLT, etc.)"
44
+
45
+ poster:
46
+ dtype: str
47
+ required: false
48
+ description: "Author/poster username"
49
+
50
+ post_url:
51
+ dtype: str
52
+ required: false
53
+ description: "URL to original post"
54
+
55
+ title:
56
+ dtype: str
57
+ required: false
58
+ description: "Post title if available"
59
+
60
+ engagement_score:
61
+ dtype: float
62
+ required: false
63
+ min_value: 0
64
+ description: "Overall engagement score"
65
+
66
+ engagement_likes:
67
+ dtype: int
68
+ required: false
69
+ min_value: 0
70
+ description: "Number of likes"
71
+
72
+ engagement_shares:
73
+ dtype: int
74
+ required: false
75
+ min_value: 0
76
+ description: "Number of shares"
77
+
78
+ engagement_comments:
79
+ dtype: int
80
+ required: false
81
+ min_value: 0
82
+ description: "Number of comments"
83
+
84
+ source_tool:
85
+ dtype: str
86
+ required: false
87
+ description: "Tool used for scraping (scrape_reddit, scrape_facebook_profile, etc.)"
88
+
89
+ # SQLite schema for feed cache
90
+ sqlite_schema:
91
+ table: seen_hashes
92
+ columns:
93
+ - content_hash: TEXT PRIMARY KEY
94
+ - first_seen: TIMESTAMP
95
+ - last_seen: TIMESTAMP
96
+ - event_id: TEXT
97
+ - summary_preview: TEXT
98
+
99
+ # Feature engineering configuration
100
+ features:
101
+ temporal:
102
+ - hour_of_day
103
+ - day_of_week
104
+ - is_weekend
105
+ - is_business_hours
106
+
107
+ engagement:
108
+ - normalized_score
109
+ - log_engagement
110
+ - engagement_ratio
111
+
112
+ text:
113
+ - language # en, si, ta
114
+ - vector_embedding # 768-dim for BERT models
115
+ - text_length
116
+ - word_count
117
+
118
+ # Language detection configuration
119
+ languages:
120
+ supported:
121
+ - code: en
122
+ name: English
123
+ model: distilbert-base-uncased
124
+ - code: si
125
+ name: Sinhala
126
+ model: keshan/SinhalaBERTo
127
+ - code: ta
128
+ name: Tamil
129
+ model: l3cube-pune/tamil-bert
130
+
131
+ detection:
132
+ method: fasttext # or lingua-py
133
+ min_confidence: 0.8
models/anomaly-detection/download_models.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models/anomaly-detection/download_models.py
3
+ Script to pre-download all required models for the pipeline.
4
+ """
5
+ import os
6
+ import sys
7
+ import requests
8
+ import logging
9
+ from pathlib import Path
10
+ from tqdm import tqdm
11
+
12
+ # Add src to path
13
+ sys.path.insert(0, str(Path(__file__).parent / "src"))
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO, format='%(message)s')
17
+ logger = logging.getLogger("downloader")
18
+
19
+ # Constants
20
+ CACHE_DIR = Path(__file__).parent / "models_cache"
21
+ FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
22
+ FASTTEXT_PATH = CACHE_DIR / "lid.176.bin"
23
+
24
+ def download_file(url, destination):
25
+ """Download file with progress bar"""
26
+ response = requests.get(url, stream=True)
27
+ total_size = int(response.headers.get('content-length', 0))
28
+
29
+ with open(destination, 'wb') as file, tqdm(
30
+ desc=destination.name,
31
+ total=total_size,
32
+ unit='iB',
33
+ unit_scale=True,
34
+ unit_divisor=1024,
35
+ ) as bar:
36
+ for data in response.iter_content(chunk_size=1024):
37
+ size = file.write(data)
38
+ bar.update(size)
39
+
40
+ def main():
41
+ logger.info("=" * 50)
42
+ logger.info("⬇️ MODEL DOWNLOADER")
43
+ logger.info("=" * 50)
44
+
45
+ # Ensure cache directory exists
46
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
47
+ logger.info(f"📂 Cache Directory: {CACHE_DIR}")
48
+
49
+ # 1. Download FastText Model
50
+ logger.info("\n[1/2] Checking FastText Model (Language Detection)...")
51
+ if not FASTTEXT_PATH.exists():
52
+ logger.info(f" Downloading lid.176.bin...")
53
+ try:
54
+ download_file(FASTTEXT_URL, FASTTEXT_PATH)
55
+ logger.info(" ✅ Download complete")
56
+ except Exception as e:
57
+ logger.error(f" ❌ Failed to download FastText: {e}")
58
+ else:
59
+ logger.info(" ✅ FastText model already exists")
60
+
61
+ # 2. Download HuggingFace Models
62
+ logger.info("\n[2/2] Checking HuggingFace BERT Models (Vectorization)...")
63
+ try:
64
+ from src.utils.vectorizer import get_vectorizer
65
+
66
+ # Initialize vectorizer which handles HF downloads
67
+ logger.info(" Initializing vectorizer to trigger downloads...")
68
+ vectorizer = get_vectorizer(models_cache_dir=str(CACHE_DIR))
69
+
70
+ # Trigger downloads for all languages
71
+ vectorizer.download_all_models()
72
+
73
+ logger.info(" ✅ All BERT models ready")
74
+
75
+ except ImportError:
76
+ logger.error(" ❌ Could not import vectorizer. Install requirements first:")
77
+ logger.error(" pip install -r requirements.txt")
78
+ except Exception as e:
79
+ logger.error(f" ❌ Error downloading BERT models: {e}")
80
+
81
+ logger.info("\n" + "=" * 50)
82
+ logger.info("✨ SETUP COMPLETE")
83
+ logger.info("=" * 50)
84
+
85
+ if __name__ == "__main__":
86
+ main()
models/anomaly-detection/main.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models/anomaly-detection/main.py
3
+ Entry point for the anomaly detection training pipeline
4
+ """
5
+ import os
6
+ import sys
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ # Add src to path
11
+ sys.path.insert(0, str(Path(__file__).parent / "src"))
12
+
13
+ from src.pipeline import run_training_pipeline
14
+ from src.entity import PipelineConfig
15
+
16
+ # Configure logging
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20
+ handlers=[
21
+ logging.StreamHandler(),
22
+ logging.FileHandler("training.log")
23
+ ]
24
+ )
25
+
26
+ logger = logging.getLogger("main")
27
+
28
+
29
+ def main():
30
+ """Run the anomaly detection training pipeline"""
31
+ logger.info("=" * 60)
32
+ logger.info("ANOMALY DETECTION PIPELINE")
33
+ logger.info("=" * 60)
34
+
35
+ # Load environment variables
36
+ from dotenv import load_dotenv
37
+ load_dotenv()
38
+
39
+ # Create configuration
40
+ config = PipelineConfig()
41
+
42
+ # Run pipeline
43
+ try:
44
+ artifact = run_training_pipeline(config)
45
+
46
+ logger.info("\n" + "=" * 60)
47
+ logger.info("PIPELINE RESULTS")
48
+ logger.info("=" * 60)
49
+ logger.info(f"Status: {artifact.pipeline_status}")
50
+ logger.info(f"Run ID: {artifact.pipeline_run_id}")
51
+ logger.info(f"Duration: {artifact.pipeline_start_time} to {artifact.pipeline_end_time}")
52
+
53
+ logger.info("\n--- Data Ingestion ---")
54
+ logger.info(f"Total records: {artifact.data_ingestion.total_records}")
55
+ logger.info(f"From SQLite: {artifact.data_ingestion.records_from_sqlite}")
56
+ logger.info(f"From CSV: {artifact.data_ingestion.records_from_csv}")
57
+
58
+ logger.info("\n--- Data Validation ---")
59
+ logger.info(f"Valid records: {artifact.data_validation.valid_records}")
60
+ logger.info(f"Validation status: {artifact.data_validation.validation_status}")
61
+
62
+ logger.info("\n--- Data Transformation ---")
63
+ logger.info(f"Language distribution: {artifact.data_transformation.language_distribution}")
64
+
65
+ logger.info("\n--- Model Training ---")
66
+ logger.info(f"Best model: {artifact.model_trainer.best_model_name}")
67
+ logger.info(f"Best metrics: {artifact.model_trainer.best_model_metrics}")
68
+ logger.info(f"MLflow run: {artifact.model_trainer.mlflow_run_id}")
69
+
70
+ if artifact.model_trainer.n_anomalies:
71
+ logger.info(f"Anomalies detected: {artifact.model_trainer.n_anomalies}")
72
+
73
+ logger.info("\n" + "=" * 60)
74
+ logger.info("PIPELINE COMPLETE")
75
+ logger.info("=" * 60)
76
+
77
+ return artifact
78
+
79
+ except Exception as e:
80
+ logger.error(f"Pipeline failed: {e}")
81
+ raise
82
+
83
+
84
+ if __name__ == "__main__":
85
+ main()
models/anomaly-detection/packages.txt ADDED
File without changes
models/anomaly-detection/requirements.txt ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Anomaly Detection Pipeline - Requirements
2
+
3
+ # ML & Clustering
4
+ optuna>=3.0
5
+ scikit-learn>=1.0
6
+ hdbscan>=0.8.29
7
+ umap-learn>=0.5
8
+
9
+ # Language Detection
10
+ fasttext-wheel>=0.9.2
11
+ lingua-language-detector>=2.0
12
+
13
+ # NLP & Transformers
14
+ transformers>=4.30
15
+ sentence-transformers>=2.0
16
+ torch>=2.0
17
+
18
+ # MLflow & Tracking
19
+ mlflow>=2.0
20
+ dagshub>=0.3
21
+
22
+ # Database
23
+ pymongo>=4.0
24
+
25
+ # Data Processing
26
+ pandas>=2.0
27
+ numpy>=1.24
28
+ pyyaml>=6.0
29
+
30
+ # Airflow (managed by Astro)
31
+ # apache-airflow>=2.7
32
+
33
+ # Utilities
34
+ joblib>=1.3
35
+ tqdm>=4.65
36
+
37
+ astro-run-dag # This package is needed for the astro run command. It will be removed before a deploy
models/anomaly-detection/src/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models/anomaly-detection/src/__init__.py
3
+ Anomaly Detection Pipeline Package
4
+ """
5
+
6
+ from .components.data_ingestion import DataIngestion
7
+ from .components.data_validation import DataValidation
8
+ from .components.data_transformation import DataTransformation
9
+ from .components.model_trainer import ModelTrainer
10
+
11
+ __all__ = [
12
+ "DataIngestion",
13
+ "DataValidation",
14
+ "DataTransformation",
15
+ "ModelTrainer"
16
+ ]
17
+
18
+ __version__ = "1.0.0"
models/anomaly-detection/src/components/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models/anomaly-detection/src/components/__init__.py
3
+
4
+ Sets up paths for integration with main project before importing components.
5
+ """
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add main project root to path for vectorization agent graph access
10
+ # Path: models/anomaly-detection/src/components/__init__.py -> go up 4 levels to ModelX-Ultimate
11
+ # Note: This is secondary to anomaly-detection path. Direct graph import won't work
12
+ # due to 'src' namespace collision. Use VectorizationAPI HTTP calls instead.
13
+ _main_project_root = Path(__file__).parent.parent.parent.parent.parent
14
+ _main_path = str(_main_project_root)
15
+ if _main_path not in sys.path:
16
+ sys.path.append(_main_path) # Append, don't insert at 0
17
+
18
+ from .data_ingestion import DataIngestion
19
+ from .data_validation import DataValidation
20
+ from .data_transformation import DataTransformation
21
+ from .model_trainer import ModelTrainer
22
+
23
+ __all__ = [
24
+ "DataIngestion",
25
+ "DataValidation",
26
+ "DataTransformation",
27
+ "ModelTrainer"
28
+ ]
29
+
models/anomaly-detection/src/components/data_ingestion.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models/anomaly-detection/src/components/data_ingestion.py
3
+ Data ingestion from SQLite feed cache and CSV files
4
+ """
5
+ import os
6
+ import sqlite3
7
+ import pandas as pd
8
+ import logging
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ from ..entity import DataIngestionConfig, DataIngestionArtifact
14
+
15
+ logger = logging.getLogger("data_ingestion")
16
+
17
+
18
+ class DataIngestion:
19
+ """
20
+ Data ingestion component that fetches feed data from:
21
+ 1. SQLite database (feed_cache.db) - production deduped feeds
22
+ 2. CSV files in datasets/political_feeds/ - historical data
23
+ """
24
+
25
+ def __init__(self, config: Optional[DataIngestionConfig] = None):
26
+ """
27
+ Initialize data ingestion component.
28
+
29
+ Args:
30
+ config: Optional configuration, uses defaults if None
31
+ """
32
+ self.config = config or DataIngestionConfig()
33
+
34
+ # Ensure output directory exists
35
+ Path(self.config.output_directory).mkdir(parents=True, exist_ok=True)
36
+
37
+ logger.info(f"[DataIngestion] Initialized")
38
+ logger.info(f" SQLite: {self.config.sqlite_db_path}")
39
+ logger.info(f" CSV Dir: {self.config.csv_directory}")
40
+ logger.info(f" Output: {self.config.output_directory}")
41
+
42
+ def _fetch_from_sqlite(self) -> pd.DataFrame:
43
+ """
44
+ Fetch feed data from SQLite cache database.
45
+
46
+ Returns:
47
+ DataFrame with feed records
48
+ """
49
+ db_path = self.config.sqlite_db_path
50
+
51
+ if not os.path.exists(db_path):
52
+ logger.warning(f"[DataIngestion] SQLite DB not found: {db_path}")
53
+ return pd.DataFrame()
54
+
55
+ try:
56
+ conn = sqlite3.connect(db_path)
57
+
58
+ # Query the seen_hashes table
59
+ query = """
60
+ SELECT
61
+ content_hash as post_id,
62
+ first_seen as timestamp,
63
+ event_id,
64
+ summary_preview as text
65
+ FROM seen_hashes
66
+ ORDER BY last_seen DESC
67
+ """
68
+ df = pd.read_sql_query(query, conn)
69
+ conn.close()
70
+
71
+ # Add default columns for compatibility
72
+ if not df.empty:
73
+ df["platform"] = "mixed"
74
+ df["category"] = "feed"
75
+ df["content_hash"] = df["post_id"]
76
+ df["source"] = "sqlite"
77
+
78
+ logger.info(f"[DataIngestion] Fetched {len(df)} records from SQLite")
79
+ return df
80
+
81
+ except Exception as e:
82
+ logger.error(f"[DataIngestion] SQLite error: {e}")
83
+ return pd.DataFrame()
84
+
85
+ def _fetch_from_csv(self) -> pd.DataFrame:
86
+ """
87
+ Fetch feed data from CSV files in datasets directory.
88
+
89
+ Returns:
90
+ Combined DataFrame from all CSV files
91
+ """
92
+ csv_dir = Path(self.config.csv_directory)
93
+
94
+ if not csv_dir.exists():
95
+ logger.warning(f"[DataIngestion] CSV directory not found: {csv_dir}")
96
+ return pd.DataFrame()
97
+
98
+ all_dfs = []
99
+ csv_files = list(csv_dir.glob("*.csv"))
100
+
101
+ for csv_file in csv_files:
102
+ try:
103
+ df = pd.read_csv(csv_file)
104
+ df["source_file"] = csv_file.name
105
+ df["source"] = "csv"
106
+ all_dfs.append(df)
107
+ logger.info(f"[DataIngestion] Loaded {len(df)} records from {csv_file.name}")
108
+ except Exception as e:
109
+ logger.warning(f"[DataIngestion] Failed to load {csv_file}: {e}")
110
+
111
+ if not all_dfs:
112
+ return pd.DataFrame()
113
+
114
+ combined = pd.concat(all_dfs, ignore_index=True)
115
+ logger.info(f"[DataIngestion] Total {len(combined)} records from {len(csv_files)} CSV files")
116
+ return combined
117
+
118
+ def _deduplicate(self, df: pd.DataFrame) -> pd.DataFrame:
119
+ """
120
+ Remove duplicate records based on content_hash.
121
+
122
+ Args:
123
+ df: Input DataFrame
124
+
125
+ Returns:
126
+ Deduplicated DataFrame
127
+ """
128
+ if df.empty:
129
+ return df
130
+
131
+ initial_count = len(df)
132
+
133
+ # Use content_hash for deduplication, fallback to post_id
134
+ if "content_hash" in df.columns:
135
+ df = df.drop_duplicates(subset=["content_hash"], keep="first")
136
+ elif "post_id" in df.columns:
137
+ df = df.drop_duplicates(subset=["post_id"], keep="first")
138
+
139
+ deduped_count = len(df)
140
+ removed = initial_count - deduped_count
141
+
142
+ if removed > 0:
143
+ logger.info(f"[DataIngestion] Deduplicated: removed {removed} duplicates")
144
+
145
+ return df
146
+
147
+ def _filter_valid_records(self, df: pd.DataFrame) -> pd.DataFrame:
148
+ """
149
+ Filter records with sufficient text content.
150
+
151
+ Args:
152
+ df: Input DataFrame
153
+
154
+ Returns:
155
+ Filtered DataFrame
156
+ """
157
+ if df.empty:
158
+ return df
159
+
160
+ initial_count = len(df)
161
+
162
+ # Ensure text column exists
163
+ if "text" not in df.columns:
164
+ # Try alternative column names
165
+ text_cols = ["summary_preview", "title", "content"]
166
+ for col in text_cols:
167
+ if col in df.columns:
168
+ df["text"] = df[col]
169
+ break
170
+
171
+ if "text" not in df.columns:
172
+ logger.warning("[DataIngestion] No text column found")
173
+ df["text"] = ""
174
+
175
+ # Filter by minimum text length
176
+ df = df[df["text"].str.len() >= self.config.min_text_length]
177
+
178
+ filtered_count = len(df)
179
+ removed = initial_count - filtered_count
180
+
181
+ if removed > 0:
182
+ logger.info(f"[DataIngestion] Filtered: removed {removed} short texts")
183
+
184
+ return df
185
+
186
+ def ingest(self) -> DataIngestionArtifact:
187
+ """
188
+ Execute data ingestion pipeline.
189
+
190
+ Returns:
191
+ DataIngestionArtifact with paths and statistics
192
+ """
193
+ logger.info("[DataIngestion] Starting data ingestion...")
194
+
195
+ # Fetch from both sources
196
+ sqlite_df = self._fetch_from_sqlite()
197
+ csv_df = self._fetch_from_csv()
198
+
199
+ records_from_sqlite = len(sqlite_df)
200
+ records_from_csv = len(csv_df)
201
+
202
+ # Combine sources
203
+ if not sqlite_df.empty and not csv_df.empty:
204
+ # Ensure compatible columns
205
+ common_cols = list(set(sqlite_df.columns) & set(csv_df.columns))
206
+ combined_df = pd.concat([
207
+ sqlite_df[common_cols],
208
+ csv_df[common_cols]
209
+ ], ignore_index=True)
210
+ elif not sqlite_df.empty:
211
+ combined_df = sqlite_df
212
+ elif not csv_df.empty:
213
+ combined_df = csv_df
214
+ else:
215
+ combined_df = pd.DataFrame()
216
+
217
+ # Deduplicate
218
+ combined_df = self._deduplicate(combined_df)
219
+
220
+ # Filter valid records
221
+ combined_df = self._filter_valid_records(combined_df)
222
+
223
+ total_records = len(combined_df)
224
+ is_data_available = total_records > 0
225
+
226
+ # Save to output
227
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
228
+ output_path = Path(self.config.output_directory) / f"ingested_data_{timestamp}.parquet"
229
+
230
+ if is_data_available:
231
+ combined_df.to_parquet(output_path, index=False)
232
+ logger.info(f"[DataIngestion] Saved {total_records} records to {output_path}")
233
+ else:
234
+ output_path = str(output_path)
235
+ logger.warning("[DataIngestion] No data available to save")
236
+
237
+ artifact = DataIngestionArtifact(
238
+ raw_data_path=str(output_path),
239
+ total_records=total_records,
240
+ records_from_sqlite=records_from_sqlite,
241
+ records_from_csv=records_from_csv,
242
+ ingestion_timestamp=timestamp,
243
+ is_data_available=is_data_available
244
+ )
245
+
246
+ logger.info(f"[DataIngestion] ✓ Complete: {total_records} records")
247
+ return artifact
models/anomaly-detection/src/components/data_transformation.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models/anomaly-detection/src/components/data_transformation.py
3
+ Data transformation with language detection and text vectorization
4
+ Integrates with Vectorization Agent Graph for LLM-enhanced processing
5
+ """
6
+ import os
7
+ import pandas as pd
8
+ import numpy as np
9
+ import logging
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Optional, Dict, Any, List
13
+ from tqdm import tqdm
14
+
15
+ from ..entity import DataTransformationConfig, DataTransformationArtifact
16
+ from ..utils import detect_language, get_vectorizer
17
+
18
+ logger = logging.getLogger("data_transformation")
19
+
20
+
21
+ class DataTransformation:
22
+ """
23
+ Data transformation component that:
24
+ 1. Detects language (Sinhala/Tamil/English)
25
+ 2. Extracts text embeddings using language-specific BERT models
26
+ 3. Engineers temporal and engagement features
27
+ 4. Optionally integrates with Vectorizer Agent Graph for LLM insights
28
+ """
29
+
30
+ def __init__(self, config: Optional[DataTransformationConfig] = None, use_agent_graph: bool = True):
31
+ """
32
+ Initialize data transformation component.
33
+
34
+ Args:
35
+ config: Optional configuration, uses defaults if None
36
+ use_agent_graph: If True, use vectorizer agent graph for processing
37
+ """
38
+ self.config = config or DataTransformationConfig()
39
+ self.use_agent_graph = use_agent_graph
40
+
41
+ # Ensure output directory exists
42
+ Path(self.config.output_directory).mkdir(parents=True, exist_ok=True)
43
+
44
+ # Get vectorizer (lazy loaded)
45
+ self.vectorizer = get_vectorizer(self.config.models_cache_dir)
46
+
47
+ # Vectorization API integration
48
+ # Note: Direct import of vectorizationAgentGraph fails due to 'src' namespace collision
49
+ # between this project (models/anomaly-detection/src) and main project (src).
50
+ # Instead, we call the Vectorization API via HTTP when available.
51
+ self.vectorizer_graph = None # Not used - we use HTTP API instead
52
+ self.vectorization_api_url = os.getenv("VECTORIZATION_API_URL", "http://localhost:8001")
53
+ self.vectorization_api_available = False
54
+
55
+ if self.use_agent_graph:
56
+ # Check if vectorization API is available
57
+ try:
58
+ import requests
59
+ response = requests.get(f"{self.vectorization_api_url}/health", timeout=10)
60
+ if response.status_code == 200:
61
+ self.vectorization_api_available = True
62
+ logger.info(f"[DataTransformation] [OK] Vectorization API available at {self.vectorization_api_url}")
63
+ else:
64
+ logger.warning(f"[DataTransformation] Vectorization API returned status {response.status_code}")
65
+ except Exception as e:
66
+ logger.warning(f"[DataTransformation] Vectorization API not available: {e}")
67
+ logger.info("[DataTransformation] Using local vectorization (no LLM insights)")
68
+
69
+ logger.info(f"[DataTransformation] Initialized")
70
+ logger.info(f" Models cache: {self.config.models_cache_dir}")
71
+ logger.info(f" Vectorization API: {'enabled' if self.vectorization_api_available else 'disabled (using local)'}")
72
+
73
+ def _process_with_agent_graph(self, texts: List[Dict[str, Any]]) -> Dict[str, Any]:
74
+ """
75
+ Process texts through the Vectorization API.
76
+
77
+ Uses HTTP calls to the vectorization API server which runs the
78
+ Vectorizer Agent Graph. This avoids the 'src' namespace collision.
79
+
80
+ This provides:
81
+ - Language detection
82
+ - Vector embeddings
83
+ - LLM expert summary
84
+ - Opportunity/threat analysis
85
+
86
+ Args:
87
+ texts: List of {text, post_id, metadata} dicts
88
+
89
+ Returns:
90
+ Dict with language_detection_results, vector_embeddings, expert_summary, etc.
91
+ """
92
+ if not self.vectorization_api_available:
93
+ logger.warning("[DataTransformation] Vectorization API not available, using fallback")
94
+ return None
95
+
96
+ try:
97
+ import requests
98
+
99
+ batch_id = datetime.now().strftime("%Y%m%d_%H%M%S")
100
+
101
+ # Prepare request payload
102
+ payload = {
103
+ "texts": [
104
+ {
105
+ "text": item.get("text", ""),
106
+ "post_id": item.get("post_id", f"text_{i}"),
107
+ "metadata": item.get("metadata", {})
108
+ }
109
+ for i, item in enumerate(texts)
110
+ ],
111
+ "batch_id": batch_id,
112
+ "include_vectors": True,
113
+ "include_expert_summary": True
114
+ }
115
+
116
+ # Call vectorization API
117
+ response = requests.post(
118
+ f"{self.vectorization_api_url}/vectorize",
119
+ json=payload,
120
+ timeout=120 # 2 minutes for large batches
121
+ )
122
+
123
+ if response.status_code == 200:
124
+ result = response.json()
125
+ logger.info(f"[DataTransformation] Vectorization API processed {len(texts)} texts")
126
+
127
+ # Convert API response to expected format
128
+ return {
129
+ "language_detection_results": result.get("vectors", []),
130
+ "vector_embeddings": result.get("vectors", []),
131
+ "expert_summary": result.get("expert_summary", ""),
132
+ "opportunities": [], # Extracted from domain_insights
133
+ "threats": [], # Extracted from domain_insights
134
+ "domain_insights": result.get("domain_insights", []),
135
+ "processing_stats": {
136
+ "language_distribution": result.get("language_distribution", {}),
137
+ "processing_time": result.get("processing_time_seconds", 0)
138
+ }
139
+ }
140
+ else:
141
+ logger.error(f"[DataTransformation] Vectorization API error: {response.status_code}")
142
+ return None
143
+
144
+ except Exception as e:
145
+ logger.error(f"[DataTransformation] Vectorization API call failed: {e}")
146
+ return None
147
+
148
+ def _detect_languages(self, df: pd.DataFrame) -> pd.DataFrame:
149
+ """
150
+ Detect language for each text entry.
151
+
152
+ Args:
153
+ df: Input DataFrame with 'text' column
154
+
155
+ Returns:
156
+ DataFrame with 'language' and 'language_confidence' columns
157
+ """
158
+ logger.info("[DataTransformation] Detecting languages...")
159
+
160
+ languages = []
161
+ confidences = []
162
+
163
+ for text in tqdm(df["text"].fillna(""), desc="Language Detection"):
164
+ lang, conf = detect_language(text)
165
+ languages.append(lang)
166
+ confidences.append(conf)
167
+
168
+ df["language"] = languages
169
+ df["language_confidence"] = confidences
170
+
171
+ # Log distribution
172
+ lang_counts = df["language"].value_counts()
173
+ logger.info(f"[DataTransformation] Language distribution:")
174
+ for lang, count in lang_counts.items():
175
+ logger.info(f" {lang}: {count} ({100*count/len(df):.1f}%)")
176
+
177
+ return df
178
+
179
+ def _extract_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
180
+ """
181
+ Extract temporal features from timestamp.
182
+
183
+ Args:
184
+ df: Input DataFrame with 'timestamp' column
185
+
186
+ Returns:
187
+ DataFrame with temporal feature columns
188
+ """
189
+ logger.info("[DataTransformation] Extracting temporal features...")
190
+
191
+ if "timestamp" not in df.columns:
192
+ logger.warning("[DataTransformation] No timestamp column found")
193
+ return df
194
+
195
+ # Convert to datetime
196
+ try:
197
+ df["datetime"] = pd.to_datetime(df["timestamp"], errors='coerce')
198
+ except Exception as e:
199
+ logger.warning(f"[DataTransformation] Timestamp conversion error: {e}")
200
+ return df
201
+
202
+ # Extract features
203
+ df["hour_of_day"] = df["datetime"].dt.hour.fillna(0).astype(int)
204
+ df["day_of_week"] = df["datetime"].dt.dayofweek.fillna(0).astype(int)
205
+ df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)
206
+ df["is_business_hours"] = ((df["hour_of_day"] >= 9) & (df["hour_of_day"] <= 17)).astype(int)
207
+
208
+ # Drop intermediate column
209
+ df = df.drop(columns=["datetime"], errors='ignore')
210
+
211
+ return df
212
+
213
+ def _extract_engagement_features(self, df: pd.DataFrame) -> pd.DataFrame:
214
+ """
215
+ Extract and normalize engagement features.
216
+
217
+ Args:
218
+ df: Input DataFrame
219
+
220
+ Returns:
221
+ DataFrame with engagement feature columns
222
+ """
223
+ logger.info("[DataTransformation] Extracting engagement features...")
224
+
225
+ # Check for engagement columns
226
+ engagement_cols = ["engagement_score", "engagement_likes", "engagement_shares", "engagement_comments"]
227
+
228
+ for col in engagement_cols:
229
+ if col not in df.columns:
230
+ df[col] = 0
231
+
232
+ # Combined engagement score
233
+ df["total_engagement"] = (
234
+ df["engagement_likes"].fillna(0) +
235
+ df["engagement_shares"].fillna(0) * 2 + # Shares weighted more
236
+ df["engagement_comments"].fillna(0)
237
+ )
238
+
239
+ # Log transform for better distribution
240
+ df["log_engagement"] = np.log1p(df["total_engagement"])
241
+
242
+ # Normalize to 0-1 range
243
+ max_engagement = df["total_engagement"].max()
244
+ if max_engagement > 0:
245
+ df["normalized_engagement"] = df["total_engagement"] / max_engagement
246
+ else:
247
+ df["normalized_engagement"] = 0
248
+
249
+ return df
250
+
251
+ def _extract_text_features(self, df: pd.DataFrame) -> pd.DataFrame:
252
+ """
253
+ Extract basic text features.
254
+
255
+ Args:
256
+ df: Input DataFrame with 'text' column
257
+
258
+ Returns:
259
+ DataFrame with text feature columns
260
+ """
261
+ logger.info("[DataTransformation] Extracting text features...")
262
+
263
+ df["text_length"] = df["text"].fillna("").str.len()
264
+ df["word_count"] = df["text"].fillna("").str.split().str.len().fillna(0).astype(int)
265
+
266
+ return df
267
+
268
+ def _vectorize_texts(self, df: pd.DataFrame) -> np.ndarray:
269
+ """
270
+ Vectorize texts using language-specific BERT models.
271
+
272
+ Args:
273
+ df: Input DataFrame with 'text' and 'language' columns
274
+
275
+ Returns:
276
+ numpy array of shape (n_samples, 768)
277
+ """
278
+ logger.info("[DataTransformation] Vectorizing texts with BERT models...")
279
+
280
+ embeddings = []
281
+
282
+ for idx, row in tqdm(df.iterrows(), total=len(df), desc="Text Vectorization"):
283
+ text = row.get("text", "")
284
+ language = row.get("language", "english")
285
+
286
+ try:
287
+ embedding = self.vectorizer.vectorize(text, language)
288
+ embeddings.append(embedding)
289
+ except Exception as e:
290
+ logger.debug(f"Vectorization error at {idx}: {e}")
291
+ embeddings.append(np.zeros(self.config.vector_dim))
292
+
293
+ return np.array(embeddings)
294
+
295
+ def _build_feature_matrix(self, df: pd.DataFrame, embeddings: np.ndarray) -> np.ndarray:
296
+ """
297
+ Combine all features into a single feature matrix.
298
+
299
+ Args:
300
+ df: DataFrame with engineered features
301
+ embeddings: Text embeddings array
302
+
303
+ Returns:
304
+ Combined feature matrix
305
+ """
306
+ logger.info("[DataTransformation] Building feature matrix...")
307
+
308
+ # Numeric features to include
309
+ numeric_cols = [
310
+ "hour_of_day", "day_of_week", "is_weekend", "is_business_hours",
311
+ "log_engagement", "normalized_engagement",
312
+ "text_length", "word_count"
313
+ ]
314
+
315
+ # Filter to available columns
316
+ available_cols = [col for col in numeric_cols if col in df.columns]
317
+
318
+ if available_cols:
319
+ numeric_features = df[available_cols].fillna(0).values
320
+ # Normalize numeric features
321
+ from sklearn.preprocessing import StandardScaler
322
+ scaler = StandardScaler()
323
+ numeric_features = scaler.fit_transform(numeric_features)
324
+ else:
325
+ numeric_features = np.zeros((len(df), 1))
326
+
327
+ # Combine with embeddings
328
+ feature_matrix = np.hstack([embeddings, numeric_features])
329
+
330
+ logger.info(f"[DataTransformation] Feature matrix shape: {feature_matrix.shape}")
331
+ return feature_matrix
332
+
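# Shape sanity check for the hstack above (illustrative): with the default 768-dim
# embeddings and the 8 numeric columns in `numeric_cols`, the combined matrix is
# (n_samples, 776), e.g.
#   import numpy as np
#   assert np.hstack([np.zeros((100, 768)), np.zeros((100, 8))]).shape == (100, 776)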
333
+ def transform(self, data_path: str) -> DataTransformationArtifact:
334
+ """
335
+ Execute data transformation pipeline.
336
+ Integrates with Vectorizer Agent Graph for LLM-enhanced processing.
337
+
338
+ Args:
339
+ data_path: Path to validated data
340
+
341
+ Returns:
342
+ DataTransformationArtifact with paths and statistics
343
+ """
344
+ import json
345
+
346
+ logger.info(f"[DataTransformation] Starting transformation: {data_path}")
347
+
348
+ # Load data
349
+ df = pd.read_parquet(data_path)
350
+ total_records = len(df)
351
+ logger.info(f"[DataTransformation] Loaded {total_records} records")
352
+
353
+ # Initialize agent graph results
354
+ agent_result = None
355
+ expert_summary = None
356
+
357
+ # Try to process with vectorizer agent graph first
358
+ if self.vectorizer_graph and self.use_agent_graph:
359
+ logger.info("[DataTransformation] Using Vectorizer Agent Graph...")
360
+
361
+ # Prepare texts for agent graph
362
+ texts_for_agent = []
363
+ for idx, row in df.iterrows():
364
+ texts_for_agent.append({
365
+ "post_id": str(row.get("id", idx)),
366
+ "text": str(row.get("text", "")),
367
+ "metadata": {
368
+ "source": row.get("source", "unknown"),
369
+ "timestamp": str(row.get("timestamp", ""))
370
+ }
371
+ })
372
+
373
+ # Process through agent graph
374
+ agent_result = self._process_with_agent_graph(texts_for_agent)
375
+
376
+ if agent_result:
377
+ expert_summary = agent_result.get("expert_summary", "")
378
+ logger.info("[DataTransformation] Agent graph completed with expert summary")
379
+
380
+ # Run standard transformations (fallback or additional)
381
+ df = self._detect_languages(df)
382
+ df = self._extract_temporal_features(df)
383
+ df = self._extract_engagement_features(df)
384
+ df = self._extract_text_features(df)
385
+
386
+ # Vectorize texts (use agent result if available, otherwise fallback)
387
+ if agent_result and agent_result.get("vector_embeddings"):
388
+ # Extract vectors from agent graph result
389
+ agent_embeddings = agent_result.get("vector_embeddings", [])
390
+ embeddings = np.array([
391
+ item.get("vector", [0.0] * self.config.vector_dim) for item in agent_embeddings
392
+ ])
393
+ logger.info(f"[DataTransformation] Using agent graph vectors: {embeddings.shape}")
394
+ else:
395
+ # Fallback to direct vectorization
396
+ embeddings = self._vectorize_texts(df)
397
+
398
+ # Build combined feature matrix
399
+ feature_matrix = self._build_feature_matrix(df, embeddings)
400
+
401
+ # Save outputs
402
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
403
+
404
+ # Save transformed dataframe
405
+ transformed_path = Path(self.config.output_directory) / f"transformed_data_{timestamp}.parquet"
406
+ df.to_parquet(transformed_path, index=False)
407
+
408
+ # Save embeddings
409
+ embeddings_path = Path(self.config.output_directory) / f"embeddings_{timestamp}.npy"
410
+ np.save(embeddings_path, embeddings)
411
+
412
+ # Save feature matrix
413
+ features_path = Path(self.config.output_directory) / f"features_{timestamp}.npy"
414
+ np.save(features_path, feature_matrix)
415
+
416
+ # Save agent graph insights if available
417
+ insights_path = None
418
+ if agent_result:
419
+ insights_path = Path(self.config.output_directory) / f"llm_insights_{timestamp}.json"
420
+ insights_data = {
421
+ "expert_summary": agent_result.get("expert_summary", ""),
422
+ "opportunities": agent_result.get("opportunities", []),
423
+ "threats": agent_result.get("threats", []),
424
+ "domain_insights": agent_result.get("domain_insights", []),
425
+ "processing_stats": agent_result.get("processing_stats", {})
426
+ }
427
+ with open(insights_path, "w", encoding="utf-8") as f:
428
+ json.dump(insights_data, f, indent=2, ensure_ascii=False)
429
+ logger.info(f"[DataTransformation] Saved LLM insights to {insights_path}")
430
+
431
+ # Language distribution
432
+ lang_dist = df["language"].value_counts().to_dict()
433
+
434
+ # Build report
435
+ report = {
436
+ "timestamp": timestamp,
437
+ "total_records": total_records,
438
+ "embedding_dim": embeddings.shape[1] if len(embeddings.shape) > 1 else 0,
439
+ "feature_dim": feature_matrix.shape[1],
440
+ "language_distribution": lang_dist,
441
+ "used_agent_graph": agent_result is not None,
442
+ "expert_summary_available": expert_summary is not None
443
+ }
444
+
445
+ artifact = DataTransformationArtifact(
446
+ transformed_data_path=str(transformed_path),
447
+ vector_embeddings_path=str(embeddings_path),
448
+ feature_store_path=str(features_path),
449
+ total_records=total_records,
450
+ language_distribution=lang_dist,
451
+ transformation_report=report
452
+ )
453
+
454
+ logger.info(f"[DataTransformation] ✓ Complete: {feature_matrix.shape}")
455
+ if agent_result:
456
+ logger.info(f"[DataTransformation] ✓ LLM Expert Summary: {len(expert_summary or '')} chars")
457
+ return artifact
458
+
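A minimal usage sketch for the component above; the path is illustrative, the package root is assumed to be importable as `src`, and the constructor is assumed to accept an optional config like the other components:

    from src.components import DataTransformation

    transformer = DataTransformation()  # default DataTransformationConfig
    artifact = transformer.transform("artifacts/data_validation/validated_data_20251208_152332.parquet")
    print(artifact.total_records, artifact.language_distribution)
    print(artifact.feature_store_path)  # .npy matrix later consumed by ModelTrainer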
models/anomaly-detection/src/components/data_validation.py ADDED
@@ -0,0 +1,261 @@
1
+ """
2
+ models/anomaly-detection/src/components/data_validation.py
3
+ Data validation component based on schema.yaml
4
+ """
5
+ import os
6
+ import yaml
7
+ import pandas as pd
8
+ import logging
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Optional, List, Dict, Any
12
+
13
+ from ..entity import DataValidationConfig, DataValidationArtifact
14
+
15
+ logger = logging.getLogger("data_validation")
16
+
17
+
18
+ class DataValidation:
19
+ """
20
+ Data validation component that validates feed data against schema.
21
+ Checks column types, required fields, and value constraints.
22
+ """
23
+
24
+ def __init__(self, config: Optional[DataValidationConfig] = None):
25
+ """
26
+ Initialize data validation component.
27
+
28
+ Args:
29
+ config: Optional configuration, uses defaults if None
30
+ """
31
+ self.config = config or DataValidationConfig()
32
+
33
+ # Ensure output directory exists
34
+ Path(self.config.output_directory).mkdir(parents=True, exist_ok=True)
35
+
36
+ # Load schema
37
+ self.schema = self._load_schema()
38
+
39
+ logger.info(f"[DataValidation] Initialized with schema: {self.config.schema_file}")
40
+
41
+ def _load_schema(self) -> Dict[str, Any]:
42
+ """Load schema from YAML file"""
43
+ if not os.path.exists(self.config.schema_file):
44
+ logger.warning(f"[DataValidation] Schema file not found: {self.config.schema_file}")
45
+ return {}
46
+
47
+ try:
48
+ with open(self.config.schema_file, 'r', encoding='utf-8') as f:
49
+ return yaml.safe_load(f)
50
+ except Exception as e:
51
+ logger.error(f"[DataValidation] Failed to load schema: {e}")
52
+ return {}
53
+
54
+ def _validate_required_columns(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
55
+ """
56
+ Check that all required columns are present.
57
+
58
+ Returns:
59
+ List of validation errors
60
+ """
61
+ errors = []
62
+
63
+ for col in self.config.required_columns:
64
+ if col not in df.columns:
65
+ errors.append({
66
+ "type": "missing_column",
67
+ "column": col,
68
+ "message": f"Required column '{col}' is missing"
69
+ })
70
+
71
+ return errors
72
+
73
+ def _validate_column_types(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
74
+ """
75
+ Validate column data types based on schema.
76
+
77
+ Returns:
78
+ List of validation errors
79
+ """
80
+ errors = []
81
+
82
+ if "feed_columns" not in self.schema:
83
+ return errors
84
+
85
+ for col_name, col_spec in self.schema["feed_columns"].items():
86
+ if col_name not in df.columns:
87
+ continue
88
+
89
+ expected_dtype = col_spec.get("dtype", "str")
90
+
91
+ # Check for null values in required columns
92
+ if col_spec.get("required", False):
93
+ null_count = df[col_name].isna().sum()
94
+ if null_count > 0:
95
+ errors.append({
96
+ "type": "null_values",
97
+ "column": col_name,
98
+ "count": int(null_count),
99
+ "message": f"Column '{col_name}' has {null_count} null values"
100
+ })
101
+
102
+ # Check min/max length for strings
103
+ if expected_dtype == "str" and col_name in df.columns:
104
+ min_len = col_spec.get("min_length", 0)
105
+ max_len = col_spec.get("max_length", float('inf'))
106
+
107
+ if min_len > 0:
108
+ short_count = (df[col_name].fillna("").str.len() < min_len).sum()
109
+ if short_count > 0:
110
+ errors.append({
111
+ "type": "min_length_violation",
112
+ "column": col_name,
113
+ "count": int(short_count),
114
+ "message": f"Column '{col_name}' has {short_count} values shorter than {min_len}"
115
+ })
116
+
117
+ # Check allowed values
118
+ allowed = col_spec.get("allowed_values")
119
+ if allowed and col_name in df.columns:
120
+ invalid_mask = ~df[col_name].isin(allowed) & df[col_name].notna()
121
+ invalid_count = invalid_mask.sum()
122
+ if invalid_count > 0:
123
+ errors.append({
124
+ "type": "invalid_value",
125
+ "column": col_name,
126
+ "count": int(invalid_count),
127
+ "allowed": allowed,
128
+ "message": f"Column '{col_name}' has {invalid_count} values not in allowed list"
129
+ })
130
+
131
+ return errors
132
+
133
+ def _validate_numeric_ranges(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
134
+ """
135
+ Validate numeric column ranges.
136
+
137
+ Returns:
138
+ List of validation errors
139
+ """
140
+ errors = []
141
+
142
+ if "feed_columns" not in self.schema:
143
+ return errors
144
+
145
+ for col_name, col_spec in self.schema["feed_columns"].items():
146
+ if col_name not in df.columns:
147
+ continue
148
+
149
+ expected_dtype = col_spec.get("dtype")
150
+
151
+ if expected_dtype in ["int", "float"]:
152
+ min_val = col_spec.get("min_value")
153
+ max_val = col_spec.get("max_value")
154
+
155
+ if min_val is not None:
156
+ try:
157
+ below_count = (pd.to_numeric(df[col_name], errors='coerce') < min_val).sum()
158
+ if below_count > 0:
159
+ errors.append({
160
+ "type": "below_minimum",
161
+ "column": col_name,
162
+ "count": int(below_count),
163
+ "min_value": min_val,
164
+ "message": f"Column '{col_name}' has {below_count} values below {min_val}"
165
+ })
166
+ except Exception:
167
+ pass
168
+
169
+ if max_val is not None:
170
+ try:
171
+ above_count = (pd.to_numeric(df[col_name], errors='coerce') > max_val).sum()
172
+ if above_count > 0:
173
+ errors.append({
174
+ "type": "above_maximum",
175
+ "column": col_name,
176
+ "count": int(above_count),
177
+ "max_value": max_val,
178
+ "message": f"Column '{col_name}' has {above_count} values above {max_val}"
179
+ })
180
+ except Exception:
181
+ pass
182
+
183
+ return errors
184
+
185
+ def validate(self, data_path: str) -> DataValidationArtifact:
186
+ """
187
+ Execute data validation pipeline.
188
+
189
+ Args:
190
+ data_path: Path to input data (parquet or csv)
191
+
192
+ Returns:
193
+ DataValidationArtifact with validation results
194
+ """
195
+ logger.info(f"[DataValidation] Validating: {data_path}")
196
+
197
+ # Load data
198
+ if data_path.endswith(".parquet"):
199
+ df = pd.read_parquet(data_path)
200
+ elif data_path.endswith(".csv"):
201
+ df = pd.read_csv(data_path)
202
+ else:
203
+ raise ValueError(f"Unsupported file format: {data_path}")
204
+
205
+ total_records = len(df)
206
+ logger.info(f"[DataValidation] Loaded {total_records} records")
207
+
208
+ # Run validations
209
+ all_errors = []
210
+ all_errors.extend(self._validate_required_columns(df))
211
+ all_errors.extend(self._validate_column_types(df))
212
+ all_errors.extend(self._validate_numeric_ranges(df))
213
+
214
+ # Calculate valid/invalid records
215
+ invalid_records = 0
216
+ for error in all_errors:
217
+ if "count" in error:
218
+ invalid_records = max(invalid_records, error["count"])
219
+
220
+ valid_records = total_records - invalid_records
221
+ validation_status = len(all_errors) == 0
222
+
223
+ # Log validation results
224
+ if validation_status:
225
+ logger.info("[DataValidation] ✓ All validations passed")
226
+ else:
227
+ logger.warning(f"[DataValidation] ⚠ Found {len(all_errors)} validation issues")
228
+ for error in all_errors[:5]: # Log first 5
229
+ logger.warning(f" - {error['message']}")
230
+
231
+ # Save validated data (even with warnings, we continue)
232
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
233
+ validated_path = Path(self.config.output_directory) / f"validated_data_{timestamp}.parquet"
234
+ df.to_parquet(validated_path, index=False)
235
+
236
+ # Save validation report
237
+ report_path = Path(self.config.output_directory) / f"validation_report_{timestamp}.yaml"
238
+ report = {
239
+ "validation_timestamp": timestamp,
240
+ "input_path": data_path,
241
+ "total_records": total_records,
242
+ "valid_records": valid_records,
243
+ "invalid_records": invalid_records,
244
+ "validation_status": validation_status,
245
+ "errors": all_errors
246
+ }
247
+ with open(report_path, 'w') as f:
248
+ yaml.dump(report, f, default_flow_style=False)
249
+
250
+ artifact = DataValidationArtifact(
251
+ validated_data_path=str(validated_path),
252
+ validation_report_path=str(report_path),
253
+ total_records=total_records,
254
+ valid_records=valid_records,
255
+ invalid_records=invalid_records,
256
+ validation_status=validation_status,
257
+ validation_errors=all_errors
258
+ )
259
+
260
+ logger.info(f"[DataValidation] ✓ Complete: {valid_records}/{total_records} valid records")
261
+ return artifact
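A minimal sketch of running the validator on an ingested parquet file (path illustrative; assumes the package root is importable as `src`):

    from src.components import DataValidation

    validator = DataValidation()  # loads data_schema/schema.yaml via DataValidationConfig
    artifact = validator.validate("artifacts/data_ingestion/ingested_data_20251208_152330.parquet")
    print(artifact.validation_status, artifact.valid_records, len(artifact.validation_errors))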
models/anomaly-detection/src/components/model_trainer.py ADDED
@@ -0,0 +1,478 @@
1
+ """
2
+ models/anomaly-detection/src/components/model_trainer.py
3
+ Model training with Optuna hyperparameter tuning for clustering/anomaly detection
4
+ """
5
+ import os
6
+ import logging
7
+ import joblib
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import Optional, Dict, Any, List
11
+ import numpy as np
12
+
13
+ from ..entity import ModelTrainerConfig, ModelTrainerArtifact
14
+ from ..utils import calculate_clustering_metrics, calculate_optuna_objective, format_metrics_report
15
+
16
+ logger = logging.getLogger("model_trainer")
17
+
18
+ # MLflow
19
+ try:
20
+ import mlflow
21
+ import mlflow.sklearn
22
+ MLFLOW_AVAILABLE = True
23
+ except ImportError:
24
+ MLFLOW_AVAILABLE = False
25
+ logger.warning("MLflow not available. Install with: pip install mlflow")
26
+
27
+ # Optuna
28
+ try:
29
+ import optuna
30
+ from optuna.samplers import TPESampler
31
+ OPTUNA_AVAILABLE = True
32
+ except ImportError:
33
+ OPTUNA_AVAILABLE = False
34
+ logger.warning("Optuna not available. Install with: pip install optuna")
35
+
36
+ # Clustering algorithms
37
+ try:
38
+ from sklearn.cluster import DBSCAN, KMeans
39
+ from sklearn.ensemble import IsolationForest
40
+ from sklearn.neighbors import LocalOutlierFactor
41
+ SKLEARN_AVAILABLE = True
42
+ except ImportError:
43
+ SKLEARN_AVAILABLE = False
44
+
45
+ try:
46
+ import hdbscan
47
+ HDBSCAN_AVAILABLE = True
48
+ except ImportError:
49
+ HDBSCAN_AVAILABLE = False
50
+ logger.warning("HDBSCAN not available. Install with: pip install hdbscan")
51
+
52
+
53
+ class ModelTrainer:
54
+ """
55
+ Model training component with:
56
+ 1. Optuna hyperparameter optimization
57
+ 2. Multiple clustering algorithms (DBSCAN, KMeans, HDBSCAN)
58
+ 3. Anomaly detection (Isolation Forest, LOF)
59
+ 4. MLflow experiment tracking
60
+ """
61
+
62
+ def __init__(self, config: Optional[ModelTrainerConfig] = None):
63
+ """
64
+ Initialize model trainer.
65
+
66
+ Args:
67
+ config: Optional configuration
68
+ """
69
+ self.config = config or ModelTrainerConfig()
70
+
71
+ # Ensure output directory exists
72
+ Path(self.config.output_directory).mkdir(parents=True, exist_ok=True)
73
+
74
+ # Setup MLflow
75
+ self._setup_mlflow()
76
+
77
+ logger.info("[ModelTrainer] Initialized")
78
+ logger.info(f" Models to train: {self.config.models_to_train}")
79
+ logger.info(f" Optuna trials: {self.config.n_optuna_trials}")
80
+
81
+ def _setup_mlflow(self):
82
+ """Configure MLflow tracking"""
83
+ if not MLFLOW_AVAILABLE:
84
+ logger.warning("[ModelTrainer] MLflow not available")
85
+ return
86
+
87
+ try:
88
+ # Set tracking URI
89
+ mlflow.set_tracking_uri(self.config.mlflow_tracking_uri)
90
+
91
+ # Set credentials for DagsHub
92
+ if self.config.mlflow_username and self.config.mlflow_password:
93
+ os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.mlflow_username
94
+ os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.mlflow_password
95
+
96
+ # Create or get experiment
97
+ try:
98
+ mlflow.create_experiment(self.config.experiment_name)
99
+ except Exception:
100
+ pass
101
+ mlflow.set_experiment(self.config.experiment_name)
102
+
103
+ logger.info(f"[ModelTrainer] MLflow configured: {self.config.mlflow_tracking_uri}")
104
+
105
+ except Exception as e:
106
+ logger.warning(f"[ModelTrainer] MLflow setup error: {e}")
107
+
108
+ def _train_dbscan(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
109
+ """
110
+ Train DBSCAN with optional Optuna tuning.
111
+ """
112
+ if not SKLEARN_AVAILABLE:
113
+ return {"error": "sklearn not available"}
114
+
115
+ # Hyperparameters
116
+ if trial:
117
+ eps = trial.suggest_float("eps", 0.1, 2.0)
118
+ min_samples = trial.suggest_int("min_samples", 2, 20)
119
+ else:
120
+ eps = 0.5
121
+ min_samples = 5
122
+
123
+ model = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
124
+ labels = model.fit_predict(X)
125
+
126
+ metrics = calculate_clustering_metrics(X, labels)
127
+ metrics["eps"] = eps
128
+ metrics["min_samples"] = min_samples
129
+
130
+ return {
131
+ "model": model,
132
+ "labels": labels,
133
+ "metrics": metrics,
134
+ "params": {"eps": eps, "min_samples": min_samples}
135
+ }
136
+
137
+ def _train_kmeans(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
138
+ """
139
+ Train KMeans with optional Optuna tuning.
140
+ """
141
+ if not SKLEARN_AVAILABLE:
142
+ return {"error": "sklearn not available"}
143
+
144
+ # Hyperparameters
145
+ if trial:
146
+ n_clusters = trial.suggest_int("n_clusters", 2, 20)
147
+ n_init = trial.suggest_int("n_init", 5, 20)
148
+ else:
149
+ n_clusters = 5
150
+ n_init = 10
151
+
152
+ model = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=42)
153
+ labels = model.fit_predict(X)
154
+
155
+ metrics = calculate_clustering_metrics(X, labels)
156
+ metrics["n_clusters"] = n_clusters
157
+
158
+ return {
159
+ "model": model,
160
+ "labels": labels,
161
+ "metrics": metrics,
162
+ "params": {"n_clusters": n_clusters, "n_init": n_init}
163
+ }
164
+
165
+ def _train_hdbscan(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
166
+ """
167
+ Train HDBSCAN with optional Optuna tuning.
168
+ """
169
+ if not HDBSCAN_AVAILABLE:
170
+ return {"error": "hdbscan not available"}
171
+
172
+ # Hyperparameters
173
+ if trial:
174
+ min_cluster_size = trial.suggest_int("min_cluster_size", 5, 50)
175
+ min_samples = trial.suggest_int("min_samples", 1, 20)
176
+ else:
177
+ min_cluster_size = 15
178
+ min_samples = 5
179
+
180
+ model = hdbscan.HDBSCAN(
181
+ min_cluster_size=min_cluster_size,
182
+ min_samples=min_samples,
183
+ core_dist_n_jobs=-1
184
+ )
185
+ labels = model.fit_predict(X)
186
+
187
+ metrics = calculate_clustering_metrics(X, labels)
188
+
189
+ return {
190
+ "model": model,
191
+ "labels": labels,
192
+ "metrics": metrics,
193
+ "params": {"min_cluster_size": min_cluster_size, "min_samples": min_samples}
194
+ }
195
+
196
+ def _train_isolation_forest(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
197
+ """
198
+ Train Isolation Forest for anomaly detection.
199
+ """
200
+ if not SKLEARN_AVAILABLE:
201
+ return {"error": "sklearn not available"}
202
+
203
+ # Hyperparameters
204
+ if trial:
205
+ contamination = trial.suggest_float("contamination", 0.01, 0.3)
206
+ n_estimators = trial.suggest_int("n_estimators", 50, 200)
207
+ else:
208
+ contamination = 0.1
209
+ n_estimators = 100
210
+
211
+ model = IsolationForest(
212
+ contamination=contamination,
213
+ n_estimators=n_estimators,
214
+ random_state=42,
215
+ n_jobs=-1
216
+ )
217
+ predictions = model.fit_predict(X)
218
+ labels = (predictions == -1).astype(int) # -1 = anomaly
219
+
220
+ n_anomalies = int(np.sum(labels))
221
+
222
+ return {
223
+ "model": model,
224
+ "labels": labels,
225
+ "metrics": {
226
+ "n_anomalies": n_anomalies,
227
+ "anomaly_rate": n_anomalies / len(X),
228
+ "contamination": contamination,
229
+ "n_estimators": n_estimators
230
+ },
231
+ "params": {"contamination": contamination, "n_estimators": n_estimators},
232
+ "anomaly_indices": np.where(labels == 1)[0].tolist()
233
+ }
234
+
235
+ def _train_lof(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
236
+ """
237
+ Train Local Outlier Factor for anomaly detection.
238
+ """
239
+ if not SKLEARN_AVAILABLE:
240
+ return {"error": "sklearn not available"}
241
+
242
+ # Hyperparameters
243
+ if trial:
244
+ n_neighbors = trial.suggest_int("n_neighbors", 5, 50)
245
+ contamination = trial.suggest_float("contamination", 0.01, 0.3)
246
+ else:
247
+ n_neighbors = 20
248
+ contamination = 0.1
249
+
250
+ model = LocalOutlierFactor(
251
+ n_neighbors=n_neighbors,
252
+ contamination=contamination,
253
+ n_jobs=-1,
254
+ novelty=True # For prediction on new data
255
+ )
256
+ model.fit(X)
257
+ predictions = model.predict(X)
258
+ labels = (predictions == -1).astype(int) # -1 = anomaly
259
+
260
+ n_anomalies = int(np.sum(labels))
261
+
262
+ return {
263
+ "model": model,
264
+ "labels": labels,
265
+ "metrics": {
266
+ "n_anomalies": n_anomalies,
267
+ "anomaly_rate": n_anomalies / len(X),
268
+ "n_neighbors": n_neighbors,
269
+ "contamination": contamination
270
+ },
271
+ "params": {"n_neighbors": n_neighbors, "contamination": contamination},
272
+ "anomaly_indices": np.where(labels == 1)[0].tolist()
273
+ }
274
+
275
+ def _optimize_model(self, model_name: str, X: np.ndarray) -> Dict[str, Any]:
276
+ """
277
+ Use Optuna to find best hyperparameters for a model.
278
+ """
279
+ if not OPTUNA_AVAILABLE:
280
+ logger.warning("[ModelTrainer] Optuna not available, using defaults")
281
+ return self._train_model(model_name, X, None)
282
+
283
+ train_func = {
284
+ "dbscan": self._train_dbscan,
285
+ "kmeans": self._train_kmeans,
286
+ "hdbscan": self._train_hdbscan,
287
+ "isolation_forest": self._train_isolation_forest,
288
+ "lof": self._train_lof
289
+ }.get(model_name)
290
+
291
+ if not train_func:
292
+ return {"error": f"Unknown model: {model_name}"}
293
+
294
+ def objective(trial):
295
+ try:
296
+ result = train_func(X, trial)
297
+ if "error" in result:
298
+ return -1.0
299
+
300
+ metrics = result.get("metrics", {})
301
+
302
+ # For clustering: use silhouette
303
+ if model_name in ["dbscan", "kmeans", "hdbscan"]:
304
+ score = metrics.get("silhouette_score", -1)
305
+ return score if score is not None else -1
306
+
307
+ # For anomaly detection: balance anomaly rate
308
+ else:
309
+ # Target anomaly rate around 5-15%
310
+ rate = metrics.get("anomaly_rate", 0)
311
+ target = 0.1
312
+ return -abs(rate - target) # Closer to target is better
313
+
314
+ except Exception as e:
315
+ logger.debug(f"Trial failed: {e}")
316
+ return -1.0
317
+
318
+ # Create and run study
319
+ study = optuna.create_study(
320
+ direction="maximize",
321
+ sampler=TPESampler(seed=42)
322
+ )
323
+
324
+ study.optimize(
325
+ objective,
326
+ n_trials=self.config.n_optuna_trials,
327
+ timeout=self.config.optuna_timeout_seconds,
328
+ show_progress_bar=True
329
+ )
330
+
331
+ logger.info(f"[ModelTrainer] {model_name} best params: {study.best_params}")
332
+ logger.info(f"[ModelTrainer] {model_name} best score: {study.best_value:.4f}")
333
+
334
+ # Train with best params
335
+ best_result = train_func(X, None) # Use defaults as base
336
+ # Override with best params
337
+ if study.best_params:
338
+ # Re-train with best params would require custom logic
339
+ # For now, we just log the best params
340
+ best_result["best_params"] = study.best_params
341
+ best_result["best_score"] = study.best_value
342
+ best_result["study_name"] = study.study_name
343
+
344
+ return best_result
345
+
346
+ def _train_model(self, model_name: str, X: np.ndarray, trial=None) -> Dict[str, Any]:
347
+ """Train a single model"""
348
+ train_funcs = {
349
+ "dbscan": self._train_dbscan,
350
+ "kmeans": self._train_kmeans,
351
+ "hdbscan": self._train_hdbscan,
352
+ "isolation_forest": self._train_isolation_forest,
353
+ "lof": self._train_lof
354
+ }
355
+
356
+ func = train_funcs.get(model_name)
357
+ if func:
358
+ return func(X, trial)
359
+ return {"error": f"Unknown model: {model_name}"}
360
+
361
+ def train(self, feature_path: str) -> ModelTrainerArtifact:
362
+ """
363
+ Execute model training pipeline.
364
+
365
+ Args:
366
+ feature_path: Path to feature matrix (.npy)
367
+
368
+ Returns:
369
+ ModelTrainerArtifact with results
370
+ """
371
+ logger.info(f"[ModelTrainer] Starting training: {feature_path}")
372
+ start_time = datetime.now()
373
+
374
+ # Load features
375
+ X = np.load(feature_path)
376
+ logger.info(f"[ModelTrainer] Loaded features: {X.shape}")
377
+
378
+ # Start MLflow run
379
+ mlflow_run_id = ""
380
+ mlflow_experiment_id = ""
381
+
382
+ if MLFLOW_AVAILABLE:
383
+ try:
384
+ run = mlflow.start_run()
385
+ mlflow_run_id = run.info.run_id
386
+ mlflow_experiment_id = run.info.experiment_id
387
+
388
+ mlflow.log_param("n_samples", X.shape[0])
389
+ mlflow.log_param("n_features", X.shape[1])
390
+ mlflow.log_param("models", self.config.models_to_train)
391
+ except Exception as e:
392
+ logger.warning(f"[ModelTrainer] MLflow run start error: {e}")
393
+
394
+ # Train all models
395
+ trained_models = []
396
+ best_model = None
397
+ best_score = -float('inf')
398
+
399
+ for model_name in self.config.models_to_train:
400
+ logger.info(f"[ModelTrainer] Training {model_name}...")
401
+
402
+ try:
403
+ result = self._optimize_model(model_name, X)
404
+
405
+ if "error" in result:
406
+ logger.warning(f"[ModelTrainer] {model_name} error: {result['error']}")
407
+ continue
408
+
409
+ # Save model
410
+ model_path = Path(self.config.output_directory) / f"{model_name}_model.joblib"
411
+ joblib.dump(result["model"], model_path)
412
+
413
+ # Log to MLflow
414
+ if MLFLOW_AVAILABLE:
415
+ try:
416
+ mlflow.log_params({f"{model_name}_{k}": v for k, v in result.get("params", {}).items()})
417
+ mlflow.log_metrics({f"{model_name}_{k}": v for k, v in result.get("metrics", {}).items() if isinstance(v, (int, float))})
418
+ mlflow.sklearn.log_model(result["model"], model_name)
419
+ except Exception as e:
420
+ logger.debug(f"MLflow log error: {e}")
421
+
422
+ # Track results
423
+ model_info = {
424
+ "name": model_name,
425
+ "path": str(model_path),
426
+ "params": result.get("params", {}),
427
+ "metrics": result.get("metrics", {})
428
+ }
429
+ trained_models.append(model_info)
430
+
431
+ # Check if best (for clustering models)
432
+ score = result.get("metrics", {}).get("silhouette_score", -1)
433
+ if score and score > best_score:
434
+ best_score = score
435
+ best_model = model_info
436
+
437
+ logger.info(f"[ModelTrainer] ✓ {model_name} complete")
438
+
439
+ except Exception as e:
440
+ logger.error(f"[ModelTrainer] {model_name} failed: {e}")
441
+
442
+ # End MLflow run
443
+ if MLFLOW_AVAILABLE:
444
+ try:
445
+ mlflow.end_run()
446
+ except Exception:
447
+ pass
448
+
449
+ # Calculate duration
450
+ duration = (datetime.now() - start_time).total_seconds()
451
+
452
+ # Get anomaly info from best anomaly detector
453
+ n_anomalies = None
454
+ anomaly_indices = None
455
+ for model_info in trained_models:
456
+ if model_info["name"] in ["isolation_forest", "lof"]:
457
+ n_anomalies = model_info["metrics"].get("n_anomalies")
458
+ break
459
+
460
+ # Build artifact
461
+ artifact = ModelTrainerArtifact(
462
+ best_model_name=best_model["name"] if best_model else "",
463
+ best_model_path=best_model["path"] if best_model else "",
464
+ best_model_metrics=best_model["metrics"] if best_model else {},
465
+ trained_models=trained_models,
466
+ mlflow_run_id=mlflow_run_id,
467
+ mlflow_experiment_id=mlflow_experiment_id,
468
+ n_clusters=best_model["metrics"].get("n_clusters") if best_model else None,
469
+ n_anomalies=n_anomalies,
470
+ anomaly_indices=anomaly_indices,
471
+ training_duration_seconds=duration,
472
+ optuna_study_name=None
473
+ )
474
+
475
+ logger.info(f"[ModelTrainer] ✓ Training complete in {duration:.1f}s")
476
+ logger.info(f"[ModelTrainer] Best model: {best_model['name'] if best_model else 'N/A'}")
477
+
478
+ return artifact
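A minimal training sketch against a saved feature matrix (file name illustrative); MLflow, Optuna and HDBSCAN are optional, and the guards above degrade gracefully when they are missing:

    from src.components import ModelTrainer
    from src.entity import ModelTrainerConfig

    trainer = ModelTrainer(ModelTrainerConfig(n_optuna_trials=10))  # fewer trials for a smoke test
    artifact = trainer.train("artifacts/data_transformation/features_20251208_152444.npy")
    print(artifact.best_model_name, artifact.best_model_metrics)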
models/anomaly-detection/src/entity/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ models/anomaly-detection/src/entity/__init__.py
3
+ """
4
+ from .config_entity import (
5
+ DataIngestionConfig,
6
+ DataValidationConfig,
7
+ DataTransformationConfig,
8
+ ModelTrainerConfig,
9
+ PipelineConfig
10
+ )
11
+ from .artifact_entity import (
12
+ DataIngestionArtifact,
13
+ DataValidationArtifact,
14
+ DataTransformationArtifact,
15
+ ModelTrainerArtifact,
16
+ PipelineArtifact
17
+ )
18
+
19
+ __all__ = [
20
+ "DataIngestionConfig",
21
+ "DataValidationConfig",
22
+ "DataTransformationConfig",
23
+ "ModelTrainerConfig",
24
+ "PipelineConfig",
25
+ "DataIngestionArtifact",
26
+ "DataValidationArtifact",
27
+ "DataTransformationArtifact",
28
+ "ModelTrainerArtifact",
29
+ "PipelineArtifact"
30
+ ]
models/anomaly-detection/src/entity/artifact_entity.py ADDED
@@ -0,0 +1,79 @@
1
+ """
2
+ models/anomaly-detection/src/entity/artifact_entity.py
3
+ Artifact entities for pipeline outputs
4
+ """
5
+ from dataclasses import dataclass
6
+ from typing import List, Dict, Any, Optional
7
+ from pathlib import Path
8
+
9
+
10
+ @dataclass
11
+ class DataIngestionArtifact:
12
+ """Artifact from data ingestion step"""
13
+ raw_data_path: str
14
+ total_records: int
15
+ records_from_sqlite: int
16
+ records_from_csv: int
17
+ ingestion_timestamp: str
18
+ is_data_available: bool
19
+
20
+
21
+ @dataclass
22
+ class DataValidationArtifact:
23
+ """Artifact from data validation step"""
24
+ validated_data_path: str
25
+ validation_report_path: str
26
+ total_records: int
27
+ valid_records: int
28
+ invalid_records: int
29
+ validation_status: bool
30
+ validation_errors: List[Dict[str, Any]]
31
+
32
+
33
+ @dataclass
34
+ class DataTransformationArtifact:
35
+ """Artifact from data transformation step"""
36
+ transformed_data_path: str
37
+ vector_embeddings_path: str
38
+ feature_store_path: str
39
+ total_records: int
40
+ language_distribution: Dict[str, int]
41
+ transformation_report: Dict[str, Any]
42
+
43
+
44
+ @dataclass
45
+ class ModelTrainerArtifact:
46
+ """Artifact from model training step"""
47
+ # Best model info
48
+ best_model_name: str
49
+ best_model_path: str
50
+ best_model_metrics: Dict[str, float]
51
+
52
+ # All trained models
53
+ trained_models: List[Dict[str, Any]]
54
+
55
+ # MLflow tracking
56
+ mlflow_run_id: str
57
+ mlflow_experiment_id: str
58
+
59
+ # Cluster/anomaly results
60
+ n_clusters: Optional[int]
61
+ n_anomalies: Optional[int]
62
+ anomaly_indices: Optional[List[int]]
63
+
64
+ # Training info
65
+ training_duration_seconds: float
66
+ optuna_study_name: Optional[str]
67
+
68
+
69
+ @dataclass
70
+ class PipelineArtifact:
71
+ """Complete pipeline artifact"""
72
+ data_ingestion: DataIngestionArtifact
73
+ data_validation: DataValidationArtifact
74
+ data_transformation: DataTransformationArtifact
75
+ model_trainer: ModelTrainerArtifact
76
+ pipeline_run_id: str
77
+ pipeline_start_time: str
78
+ pipeline_end_time: str
79
+ pipeline_status: str
models/anomaly-detection/src/entity/config_entity.py ADDED
@@ -0,0 +1,109 @@
1
+ """
2
+ models/anomaly-detection/src/entity/config_entity.py
3
+ Configuration entities for the anomaly detection pipeline
4
+ """
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import List, Optional
8
+ import os
9
+
10
+
11
+ @dataclass
12
+ class DataIngestionConfig:
13
+ """Configuration for data ingestion component"""
14
+ sqlite_db_path: str = field(default_factory=lambda: os.getenv(
15
+ "SQLITE_DB_PATH",
16
+ str(Path(__file__).parent.parent.parent.parent.parent / "data" / "feeds" / "feed_cache.db")
17
+ ))
18
+ csv_directory: str = field(default_factory=lambda: str(
19
+ Path(__file__).parent.parent.parent.parent.parent / "datasets" / "political_feeds"
20
+ ))
21
+ output_directory: str = field(default_factory=lambda: str(
22
+ Path(__file__).parent.parent.parent / "artifacts" / "data_ingestion"
23
+ ))
24
+ batch_size: int = 1000
25
+ min_text_length: int = 10
26
+
27
+
28
+ @dataclass
29
+ class DataValidationConfig:
30
+ """Configuration for data validation component"""
31
+ schema_file: str = field(default_factory=lambda: str(
32
+ Path(__file__).parent.parent.parent / "data_schema" / "schema.yaml"
33
+ ))
34
+ required_columns: List[str] = field(default_factory=lambda: [
35
+ "post_id", "timestamp", "platform", "category", "text", "content_hash"
36
+ ])
37
+ output_directory: str = field(default_factory=lambda: str(
38
+ Path(__file__).parent.parent.parent / "artifacts" / "data_validation"
39
+ ))
40
+
41
+
42
+ @dataclass
43
+ class DataTransformationConfig:
44
+ """Configuration for data transformation/vectorization component"""
45
+ # Huggingface models - will be downloaded locally
46
+ models_cache_dir: str = field(default_factory=lambda: str(
47
+ Path(__file__).parent.parent.parent / "models_cache"
48
+ ))
49
+
50
+ # Language-specific BERT models
51
+ english_model: str = "distilbert-base-uncased"
52
+ sinhala_model: str = "keshan/SinhalaBERTo"
53
+ tamil_model: str = "l3cube-pune/tamil-bert"
54
+
55
+ # Language detection
56
+ fasttext_model_path: str = field(default_factory=lambda: str(
57
+ Path(__file__).parent.parent.parent / "models_cache" / "lid.176.bin" # FastText language ID model
58
+ ))
59
+
60
+ # Vector dimensions
61
+ vector_dim: int = 768 # Standard BERT dimension
62
+
63
+ # Output
64
+ output_directory: str = field(default_factory=lambda: str(
65
+ Path(__file__).parent.parent.parent / "artifacts" / "data_transformation"
66
+ ))
67
+
68
+
69
+ @dataclass
70
+ class ModelTrainerConfig:
71
+ """Configuration for model training component"""
72
+ # MLflow configuration
73
+ mlflow_tracking_uri: str = field(default_factory=lambda: os.getenv(
74
+ "MLFLOW_TRACKING_URI", "https://dagshub.com/sliitguy/SecurityNetwork.mlflow"
75
+ ))
76
+ mlflow_username: str = field(default_factory=lambda: os.getenv(
77
+ "MLFLOW_TRACKING_USERNAME", ""
78
+ ))
79
+ mlflow_password: str = field(default_factory=lambda: os.getenv(
80
+ "MLFLOW_TRACKING_PASSWORD", ""
81
+ ))
82
+ experiment_name: str = "anomaly_detection_feeds"
83
+
84
+ # Model configurations
85
+ models_to_train: List[str] = field(default_factory=lambda: [
86
+ "dbscan", "kmeans", "hdbscan", "isolation_forest", "lof"
87
+ ])
88
+
89
+ # Optuna hyperparameter tuning
90
+ n_optuna_trials: int = 50
91
+ optuna_timeout_seconds: int = 3600 # 1 hour
92
+
93
+ # Model output
94
+ output_directory: str = field(default_factory=lambda: str(
95
+ Path(__file__).parent.parent.parent / "artifacts" / "model_trainer"
96
+ ))
97
+
98
+
99
+ @dataclass
100
+ class PipelineConfig:
101
+ """Master configuration for the entire pipeline"""
102
+ data_ingestion: DataIngestionConfig = field(default_factory=DataIngestionConfig)
103
+ data_validation: DataValidationConfig = field(default_factory=DataValidationConfig)
104
+ data_transformation: DataTransformationConfig = field(default_factory=DataTransformationConfig)
105
+ model_trainer: ModelTrainerConfig = field(default_factory=ModelTrainerConfig)
106
+
107
+ # Pipeline settings
108
+ batch_threshold: int = 1000 # Trigger training after this many new records
109
+ run_interval_hours: int = 24 # Fallback daily run
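Because every config is a plain dataclass whose defaults read the environment at construction time, individual fields can be overridden per run; a hedged sketch:

    import os
    os.environ.setdefault("MLFLOW_TRACKING_URI", "http://localhost:5000")  # picked up by the default_factory

    from src.entity import PipelineConfig, ModelTrainerConfig

    config = PipelineConfig(
        model_trainer=ModelTrainerConfig(n_optuna_trials=5, optuna_timeout_seconds=300)
    )
    print(config.model_trainer.mlflow_tracking_uri)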
models/anomaly-detection/src/pipeline/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ models/anomaly-detection/src/pipeline/__init__.py
3
+ """
4
+ from .training_pipeline import TrainingPipeline, run_training_pipeline
5
+
6
+ __all__ = ["TrainingPipeline", "run_training_pipeline"]
models/anomaly-detection/src/pipeline/training_pipeline.py ADDED
@@ -0,0 +1,162 @@
1
+ """
2
+ models/anomaly-detection/src/pipeline/training_pipeline.py
3
+ End-to-end training pipeline orchestrator
4
+ """
5
+ import logging
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from ..entity import (
11
+ PipelineConfig,
12
+ PipelineArtifact,
13
+ DataIngestionArtifact,
14
+ DataValidationArtifact,
15
+ DataTransformationArtifact,
16
+ ModelTrainerArtifact
17
+ )
18
+ from ..components import (
19
+ DataIngestion,
20
+ DataValidation,
21
+ DataTransformation,
22
+ ModelTrainer
23
+ )
24
+
25
+ logger = logging.getLogger("training_pipeline")
26
+
27
+
28
+ class TrainingPipeline:
29
+ """
30
+ End-to-end training pipeline that orchestrates:
31
+ 1. Data Ingestion (SQLite + CSV)
32
+ 2. Data Validation (schema checking)
33
+ 3. Data Transformation (language detection + vectorization)
34
+ 4. Model Training (clustering + anomaly detection)
35
+ """
36
+
37
+ def __init__(self, config: Optional[PipelineConfig] = None):
38
+ """
39
+ Initialize training pipeline.
40
+
41
+ Args:
42
+ config: Optional pipeline configuration
43
+ """
44
+ self.config = config or PipelineConfig()
45
+ self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
46
+
47
+ logger.info(f"[TrainingPipeline] Initialized (run_id: {self.run_id})")
48
+
49
+ def run_data_ingestion(self) -> DataIngestionArtifact:
50
+ """Execute data ingestion step"""
51
+ logger.info("=" * 50)
52
+ logger.info("[TrainingPipeline] STEP 1: Data Ingestion")
53
+ logger.info("=" * 50)
54
+
55
+ ingestion = DataIngestion(self.config.data_ingestion)
56
+ artifact = ingestion.ingest()
57
+
58
+ if not artifact.is_data_available:
59
+ raise ValueError("No data available for training")
60
+
61
+ return artifact
62
+
63
+ def run_data_validation(self, ingestion_artifact: DataIngestionArtifact) -> DataValidationArtifact:
64
+ """Execute data validation step"""
65
+ logger.info("=" * 50)
66
+ logger.info("[TrainingPipeline] STEP 2: Data Validation")
67
+ logger.info("=" * 50)
68
+
69
+ validation = DataValidation(self.config.data_validation)
70
+ artifact = validation.validate(ingestion_artifact.raw_data_path)
71
+
72
+ return artifact
73
+
74
+ def run_data_transformation(self, validation_artifact: DataValidationArtifact) -> DataTransformationArtifact:
75
+ """Execute data transformation step"""
76
+ logger.info("=" * 50)
77
+ logger.info("[TrainingPipeline] STEP 3: Data Transformation")
78
+ logger.info("=" * 50)
79
+
80
+ transformation = DataTransformation(self.config.data_transformation)
81
+ artifact = transformation.transform(validation_artifact.validated_data_path)
82
+
83
+ return artifact
84
+
85
+ def run_model_training(self, transformation_artifact: DataTransformationArtifact) -> ModelTrainerArtifact:
86
+ """Execute model training step"""
87
+ logger.info("=" * 50)
88
+ logger.info("[TrainingPipeline] STEP 4: Model Training")
89
+ logger.info("=" * 50)
90
+
91
+ trainer = ModelTrainer(self.config.model_trainer)
92
+ artifact = trainer.train(transformation_artifact.feature_store_path)
93
+
94
+ return artifact
95
+
96
+ def run(self) -> PipelineArtifact:
97
+ """
98
+ Execute the complete training pipeline.
99
+
100
+ Returns:
101
+ PipelineArtifact with all step results
102
+ """
103
+ start_time = datetime.now()
104
+ logger.info("=" * 60)
105
+ logger.info("[TrainingPipeline] STARTING TRAINING PIPELINE")
106
+ logger.info("=" * 60)
107
+
108
+ try:
109
+ # Step 1: Data Ingestion
110
+ ingestion_artifact = self.run_data_ingestion()
111
+
112
+ # Step 2: Data Validation
113
+ validation_artifact = self.run_data_validation(ingestion_artifact)
114
+
115
+ # Step 3: Data Transformation
116
+ transformation_artifact = self.run_data_transformation(validation_artifact)
117
+
118
+ # Step 4: Model Training
119
+ training_artifact = self.run_model_training(transformation_artifact)
120
+
121
+ pipeline_status = "SUCCESS"
122
+
123
+ except Exception as e:
124
+ logger.error(f"[TrainingPipeline] Pipeline failed: {e}")
125
+ pipeline_status = f"FAILED: {str(e)}"
126
+ raise
127
+
128
+ finally:
129
+ end_time = datetime.now()
130
+ duration = (end_time - start_time).total_seconds()
131
+ logger.info("=" * 60)
132
+ logger.info(f"[TrainingPipeline] PIPELINE {pipeline_status}")
133
+ logger.info(f"[TrainingPipeline] Duration: {duration:.1f}s")
134
+ logger.info("=" * 60)
135
+
136
+ # Build final artifact
137
+ artifact = PipelineArtifact(
138
+ data_ingestion=ingestion_artifact,
139
+ data_validation=validation_artifact,
140
+ data_transformation=transformation_artifact,
141
+ model_trainer=training_artifact,
142
+ pipeline_run_id=self.run_id,
143
+ pipeline_start_time=start_time.isoformat(),
144
+ pipeline_end_time=end_time.isoformat(),
145
+ pipeline_status=pipeline_status
146
+ )
147
+
148
+ return artifact
149
+
150
+
151
+ def run_training_pipeline(config: Optional[PipelineConfig] = None) -> PipelineArtifact:
152
+ """
153
+ Convenience function to run the training pipeline.
154
+
155
+ Args:
156
+ config: Optional pipeline configuration
157
+
158
+ Returns:
159
+ PipelineArtifact with results
160
+ """
161
+ pipeline = TrainingPipeline(config)
162
+ return pipeline.run()
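End-to-end invocation is a one-liner; the sketch below assumes the SQLite/CSV sources configured in DataIngestionConfig are reachable:

    import logging
    from src.pipeline import run_training_pipeline

    logging.basicConfig(level=logging.INFO)
    result = run_training_pipeline()
    print(result.pipeline_status, result.model_trainer.best_model_name)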
models/anomaly-detection/src/utils/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """
2
+ models/anomaly-detection/src/utils/__init__.py
3
+ """
4
+ from .language_detector import LanguageDetector, detect_language, get_detector
5
+ from .vectorizer import MultilingualVectorizer, vectorize_text, get_vectorizer
6
+ from .metrics import (
7
+ calculate_clustering_metrics,
8
+ calculate_anomaly_metrics,
9
+ calculate_optuna_objective,
10
+ format_metrics_report
11
+ )
12
+
13
+ __all__ = [
14
+ "LanguageDetector",
15
+ "detect_language",
16
+ "get_detector",
17
+ "MultilingualVectorizer",
18
+ "vectorize_text",
19
+ "get_vectorizer",
20
+ "calculate_clustering_metrics",
21
+ "calculate_anomaly_metrics",
22
+ "calculate_optuna_objective",
23
+ "format_metrics_report"
24
+ ]
models/anomaly-detection/src/utils/language_detector.py ADDED
@@ -0,0 +1,209 @@
1
+ """
2
+ models/anomaly-detection/src/utils/language_detector.py
3
+ Language detection using FastText or lingua-py for Sinhala/Tamil/English
4
+ """
5
+ import os
6
+ import logging
7
+ from typing import Tuple, Optional
8
+ from pathlib import Path
9
+ import re
10
+
11
+ logger = logging.getLogger("language_detector")
12
+
13
+ # Try FastText first, fallback to lingua
14
+ try:
15
+ import fasttext
16
+ fasttext.FastText.eprint = lambda x: None # Suppress warnings
17
+ FASTTEXT_AVAILABLE = True
18
+ except ImportError:
19
+ FASTTEXT_AVAILABLE = False
20
+ logger.warning("FastText not available. Install with: pip install fasttext")
21
+
22
+ try:
23
+ from lingua import Language, LanguageDetectorBuilder
24
+ LINGUA_AVAILABLE = True
25
+ except ImportError:
26
+ LINGUA_AVAILABLE = False
27
+ logger.warning("Lingua not available. Install with: pip install lingua-language-detector")
28
+
29
+
30
+ class LanguageDetector:
31
+ """
32
+ Multilingual language detector supporting Sinhala, Tamil, and English.
33
+ Uses FastText as primary detector with lingua fallback.
34
+ """
35
+
36
+ # Language code mapping
37
+ LANG_MAP = {
38
+ "en": "english",
39
+ "si": "sinhala",
40
+ "ta": "tamil",
41
+ "__label__en": "english",
42
+ "__label__si": "sinhala",
43
+ "__label__ta": "tamil",
44
+ "ENGLISH": "english",
45
+ "SINHALA": "sinhala",
46
+ "TAMIL": "tamil"
47
+ }
48
+
49
+ # Unicode ranges for script detection
50
+ SINHALA_RANGE = (0x0D80, 0x0DFF)
51
+ TAMIL_RANGE = (0x0B80, 0x0BFF)
52
+
53
+ def __init__(self, models_cache_dir: Optional[str] = None):
54
+ """
55
+ Initialize language detector.
56
+
57
+ Args:
58
+ models_cache_dir: Directory for cached FastText models
59
+ """
60
+ self.models_cache_dir = models_cache_dir or str(
61
+ Path(__file__).parent.parent.parent / "models_cache"
62
+ )
63
+ Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)
64
+
65
+ self.fasttext_model = None
66
+ self.lingua_detector = None
67
+
68
+ self._init_detectors()
69
+
70
+ def _init_detectors(self):
71
+ """Initialize detection models"""
72
+ # Try FastText
73
+ if FASTTEXT_AVAILABLE:
74
+ model_path = Path(self.models_cache_dir) / "lid.176.bin"
75
+ if model_path.exists():
76
+ try:
77
+ self.fasttext_model = fasttext.load_model(str(model_path))
78
+ logger.info(f"[LanguageDetector] Loaded FastText model from {model_path}")
79
+ except Exception as e:
80
+ logger.warning(f"[LanguageDetector] Failed to load FastText: {e}")
81
+ else:
82
+ logger.warning(f"[LanguageDetector] FastText model not found at {model_path}")
83
+ logger.info("Download from: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin")
84
+
85
+ # Initialize lingua as fallback
86
+ if LINGUA_AVAILABLE:
87
+ try:
88
+ self.lingua_detector = LanguageDetectorBuilder.from_languages(
89
+ Language.ENGLISH,
90
+ Language.TAMIL,
91
+ # Note: Lingua may not have Sinhala, we'll use script detection
92
+ ).build()
93
+ logger.info("[LanguageDetector] Initialized Lingua detector")
94
+ except Exception as e:
95
+ logger.warning(f"[LanguageDetector] Failed to init Lingua: {e}")
96
+
97
+ def _detect_by_script(self, text: str) -> Optional[str]:
98
+ """
99
+ Detect language by Unicode script analysis.
100
+ More reliable for Sinhala/Tamil which have distinct scripts.
101
+ """
102
+ sinhala_count = 0
103
+ tamil_count = 0
104
+ latin_count = 0
105
+
106
+ for char in text:
107
+ code = ord(char)
108
+ if self.SINHALA_RANGE[0] <= code <= self.SINHALA_RANGE[1]:
109
+ sinhala_count += 1
110
+ elif self.TAMIL_RANGE[0] <= code <= self.TAMIL_RANGE[1]:
111
+ tamil_count += 1
112
+ elif char.isalpha() and code < 128:
113
+ latin_count += 1
114
+
115
+ total_alpha = sinhala_count + tamil_count + latin_count
116
+ if total_alpha == 0:
117
+ return None
118
+
119
+ # Threshold-based detection
120
+ if sinhala_count / total_alpha > 0.3:
121
+ return "sinhala"
122
+ if tamil_count / total_alpha > 0.3:
123
+ return "tamil"
124
+ if latin_count / total_alpha > 0.5:
125
+ return "english"
126
+
127
+ return None
128
+
129
+ def detect(self, text: str) -> Tuple[str, float]:
130
+ """
131
+ Detect language of text.
132
+
133
+ Args:
134
+ text: Input text
135
+
136
+ Returns:
137
+ Tuple of (language_code, confidence)
138
+ language_code: 'english', 'sinhala', 'tamil', or 'unknown'
139
+ """
140
+ if not text or len(text.strip()) < 3:
141
+ return "unknown", 0.0
142
+
143
+ # Clean text
144
+ clean_text = re.sub(r'http\S+|@\w+|#\w+', '', text)
145
+ clean_text = clean_text.strip()
146
+
147
+ if not clean_text:
148
+ return "unknown", 0.0
149
+
150
+ # 1. First try script detection (most reliable for Sinhala/Tamil)
151
+ script_lang = self._detect_by_script(clean_text)
152
+ if script_lang in ["sinhala", "tamil"]:
153
+ return script_lang, 0.95
154
+
155
+ # 2. Try FastText
156
+ if self.fasttext_model:
157
+ try:
158
+ predictions = self.fasttext_model.predict(clean_text.replace("\n", " "))
159
+ label = predictions[0][0]
160
+ confidence = predictions[1][0]
161
+
162
+ lang = self.LANG_MAP.get(label, "unknown")
163
+ if lang != "unknown" and confidence > 0.5:
164
+ return lang, float(confidence)
165
+ except Exception as e:
166
+ logger.debug(f"FastText error: {e}")
167
+
168
+ # 3. Try Lingua
169
+ if self.lingua_detector:
170
+ try:
171
+ detected = self.lingua_detector.detect_language_of(clean_text)
172
+ if detected:
173
+ lang = self.LANG_MAP.get(detected.name, "unknown")
174
+ # Lingua doesn't return confidence, estimate based on text
175
+ confidence = 0.8 if len(clean_text) > 20 else 0.6
176
+ return lang, confidence
177
+ except Exception as e:
178
+ logger.debug(f"Lingua error: {e}")
179
+
180
+ # 4. Fallback to script detection result or default
181
+ if script_lang == "english":
182
+ return "english", 0.7
183
+
184
+ return "english", 0.5 # Default to English
185
+
186
+
187
+ # Singleton instance
188
+ _detector: Optional[LanguageDetector] = None
189
+
190
+
191
+ def get_detector(models_cache_dir: Optional[str] = None) -> LanguageDetector:
192
+ """Get or create singleton detector instance"""
193
+ global _detector
194
+ if _detector is None:
195
+ _detector = LanguageDetector(models_cache_dir)
196
+ return _detector
197
+
198
+
199
+ def detect_language(text: str) -> Tuple[str, float]:
200
+ """
201
+ Convenience function for language detection.
202
+
203
+ Args:
204
+ text: Input text
205
+
206
+ Returns:
207
+ Tuple of (language: str, confidence: float)
208
+ """
209
+ return get_detector().detect(text)
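A quick sketch of the public helper; the FastText lid.176.bin model may be absent, in which case the script-range fallback above still resolves Sinhala and Tamil (example strings are illustrative):

    from src.utils import detect_language

    print(detect_language("ශ්‍රී ලංකා මැතිවරණය"))              # expected ("sinhala", 0.95) via script detection
    print(detect_language("இலங்கை தேர்தல் முடிவுகள்"))         # expected ("tamil", 0.95)
    print(detect_language("Election results announced today"))  # expected ("english", ...)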
models/anomaly-detection/src/utils/metrics.py ADDED
@@ -0,0 +1,256 @@
1
+ """
2
+ models/anomaly-detection/src/utils/metrics.py
3
+ Clustering and anomaly detection metrics for model evaluation
4
+ """
5
+ import numpy as np
6
+ from typing import Dict, Any, Optional, List
7
+ import logging
8
+
9
+ logger = logging.getLogger("metrics")
10
+
11
+ # Scikit-learn metrics
12
+ try:
13
+ from sklearn.metrics import (
14
+ silhouette_score,
15
+ calinski_harabasz_score,
16
+ davies_bouldin_score,
17
+ adjusted_rand_score,
18
+ normalized_mutual_info_score
19
+ )
20
+ SKLEARN_AVAILABLE = True
21
+ except ImportError:
22
+ SKLEARN_AVAILABLE = False
23
+ logger.warning("scikit-learn not available for metrics")
24
+
25
+
26
+ def calculate_clustering_metrics(
27
+ X: np.ndarray,
28
+ labels: np.ndarray,
29
+ true_labels: Optional[np.ndarray] = None
30
+ ) -> Dict[str, float]:
31
+ """
32
+ Calculate comprehensive clustering quality metrics.
33
+
34
+ Args:
35
+ X: Feature matrix (n_samples, n_features)
36
+ labels: Predicted cluster labels
37
+ true_labels: Optional ground truth labels for supervised metrics
38
+
39
+ Returns:
40
+ Dict of metric_name -> metric_value
41
+ """
42
+ if not SKLEARN_AVAILABLE:
43
+ logger.warning("sklearn not available, returning empty metrics")
44
+ return {}
45
+
46
+ metrics = {}
47
+
48
+ # Filter out noise points (label=-1) for some metrics
49
+ valid_mask = labels >= 0
50
+ n_clusters = len(set(labels[valid_mask]))
51
+
52
+ # Need at least 2 clusters and >1 samples for metrics
53
+ if n_clusters < 2 or np.sum(valid_mask) < 2:
54
+ metrics["n_clusters"] = n_clusters
55
+ metrics["n_noise_points"] = np.sum(labels == -1)
56
+ metrics["error"] = "insufficient_clusters"
57
+ return metrics
58
+
59
+ # Internal metrics (don't need ground truth)
60
+ try:
61
+ # Silhouette Score: -1 (bad) to 1 (good)
62
+ # Measures how similar objects are to their own cluster vs other clusters
63
+ metrics["silhouette_score"] = float(silhouette_score(
64
+ X[valid_mask], labels[valid_mask]
65
+ ))
66
+ except Exception as e:
67
+ logger.debug(f"Silhouette score failed: {e}")
68
+ metrics["silhouette_score"] = None
69
+
70
+ try:
71
+ # Calinski-Harabasz Index: Higher is better
72
+ # Ratio of between-cluster dispersion to within-cluster dispersion
73
+ metrics["calinski_harabasz_score"] = float(calinski_harabasz_score(
74
+ X[valid_mask], labels[valid_mask]
75
+ ))
76
+ except Exception as e:
77
+ logger.debug(f"Calinski-Harabasz failed: {e}")
78
+ metrics["calinski_harabasz_score"] = None
79
+
80
+ try:
81
+ # Davies-Bouldin Index: Lower is better
82
+ # Average similarity between clusters
83
+ metrics["davies_bouldin_score"] = float(davies_bouldin_score(
84
+ X[valid_mask], labels[valid_mask]
85
+ ))
86
+ except Exception as e:
87
+ logger.debug(f"Davies-Bouldin failed: {e}")
88
+ metrics["davies_bouldin_score"] = None
89
+
90
+ # Cluster statistics
91
+ metrics["n_clusters"] = n_clusters
92
+ metrics["n_samples"] = len(labels)
93
+ metrics["n_noise_points"] = int(np.sum(labels == -1))
94
+ metrics["noise_ratio"] = float(np.sum(labels == -1) / len(labels))
95
+
96
+ # Cluster size statistics
97
+ cluster_sizes = [int(np.sum(labels == c)) for c in sorted(set(labels[valid_mask]))]
98
+ metrics["min_cluster_size"] = int(min(cluster_sizes)) if cluster_sizes else 0
99
+ metrics["max_cluster_size"] = int(max(cluster_sizes)) if cluster_sizes else 0
100
+ metrics["mean_cluster_size"] = float(np.mean(cluster_sizes)) if cluster_sizes else 0
101
+
102
+ # External metrics (if ground truth provided)
103
+ if true_labels is not None:
104
+ try:
105
+ # Adjusted Rand Index: -1 to 1, 1=perfect, 0=random
106
+ metrics["adjusted_rand_score"] = float(adjusted_rand_score(
107
+ true_labels, labels
108
+ ))
109
+ except Exception as e:
110
+ logger.debug(f"ARI failed: {e}")
111
+
112
+ try:
113
+ # Normalized Mutual Information: 0 to 1, 1=perfect agreement
114
+ metrics["normalized_mutual_info"] = float(normalized_mutual_info_score(
115
+ true_labels, labels
116
+ ))
117
+ except Exception as e:
118
+ logger.debug(f"NMI failed: {e}")
119
+
120
+ return metrics
121
+
122
+
123
+ def calculate_anomaly_metrics(
124
+ labels: np.ndarray,
125
+ predicted_anomalies: np.ndarray,
126
+ true_anomalies: Optional[np.ndarray] = None
127
+ ) -> Dict[str, float]:
128
+ """
129
+ Calculate anomaly detection metrics.
130
+
131
+ Args:
132
+ labels: Cluster labels or -1 for anomalies
133
+ predicted_anomalies: Boolean array of predicted anomaly flags
134
+ true_anomalies: Optional ground truth anomaly flags
135
+
136
+ Returns:
137
+ Dict of metric_name -> metric_value
138
+ """
139
+ metrics = {}
140
+
141
+ n_samples = len(labels)
142
+ n_predicted_anomalies = int(np.sum(predicted_anomalies))
143
+
144
+ metrics["n_samples"] = n_samples
145
+ metrics["n_predicted_anomalies"] = n_predicted_anomalies
146
+ metrics["anomaly_rate"] = float(n_predicted_anomalies / n_samples) if n_samples > 0 else 0
147
+
148
+ # If ground truth available, calculate precision/recall
149
+ if true_anomalies is not None:
150
+ n_true_anomalies = int(np.sum(true_anomalies))
151
+
152
+ # True positives: predicted AND actual anomalies
153
+ tp = int(np.sum(predicted_anomalies & true_anomalies))
154
+ # False positives: predicted anomaly but not actual
155
+ fp = int(np.sum(predicted_anomalies & ~true_anomalies))
156
+ # False negatives: not predicted but actual anomaly
157
+ fn = int(np.sum(~predicted_anomalies & true_anomalies))
158
+ # True negatives
159
+ tn = int(np.sum(~predicted_anomalies & ~true_anomalies))
160
+
161
+ metrics["true_positives"] = tp
162
+ metrics["false_positives"] = fp
163
+ metrics["false_negatives"] = fn
164
+ metrics["true_negatives"] = tn
165
+
166
+ # Precision: TP / (TP + FP)
167
+ metrics["precision"] = float(tp / (tp + fp)) if (tp + fp) > 0 else 0
168
+
169
+ # Recall: TP / (TP + FN)
170
+ metrics["recall"] = float(tp / (tp + fn)) if (tp + fn) > 0 else 0
171
+
172
+ # F1 Score
173
+ if metrics["precision"] + metrics["recall"] > 0:
174
+ metrics["f1_score"] = float(
175
+ 2 * metrics["precision"] * metrics["recall"] /
176
+ (metrics["precision"] + metrics["recall"])
177
+ )
178
+ else:
179
+ metrics["f1_score"] = 0
180
+
181
+ return metrics
182
+
183
+
184
+ def calculate_optuna_objective(
185
+ X: np.ndarray,
186
+ labels: np.ndarray,
187
+ objective_type: str = "silhouette"
188
+ ) -> float:
189
+ """
190
+ Calculate objective value for Optuna optimization.
191
+
192
+ Args:
193
+ X: Feature matrix
194
+ labels: Predicted labels
195
+ objective_type: 'silhouette', 'calinski', or 'combined'
196
+
197
+ Returns:
198
+ Objective value (higher is better)
199
+ """
200
+ metrics = calculate_clustering_metrics(X, labels)
201
+
202
+ # Check for errors
203
+ if "error" in metrics:
204
+ return -1.0 # Return bad score for failed clustering
205
+
206
+ if objective_type == "silhouette":
207
+ score = metrics.get("silhouette_score")
208
+ return score if score is not None else -1.0
209
+
210
+ elif objective_type == "calinski":
211
+ score = metrics.get("calinski_harabasz_score")
212
+ # Normalize to 0-1 range (approximate)
213
+ return min(score / 1000, 1.0) if score is not None else -1.0
214
+
215
+ elif objective_type == "combined":
216
+ # Weighted combination of metrics
217
+ silhouette = metrics.get("silhouette_score", -1)
218
+ calinski = min(metrics.get("calinski_harabasz_score", 0) / 1000, 1)
219
+ davies = metrics.get("davies_bouldin_score", 10)
220
+
221
+ # Davies-Bouldin is lower=better, invert it
222
+ davies_inv = 1 / (1 + davies) if davies is not None else 0
223
+
224
+ # Weighted combination
225
+ combined = (0.4 * silhouette + 0.3 * calinski + 0.3 * davies_inv)
226
+ return float(combined)
227
+
228
+ return -1.0
229
+
230
+
231
+ def format_metrics_report(metrics: Dict[str, Any]) -> str:
232
+ """
233
+ Format metrics dictionary as a readable report.
234
+
235
+ Args:
236
+ metrics: Dictionary of metric values
237
+
238
+ Returns:
239
+ Formatted string report
240
+ """
241
+ lines = ["=" * 50]
242
+ lines.append("CLUSTERING METRICS REPORT")
243
+ lines.append("=" * 50)
244
+
245
+ for key, value in metrics.items():
246
+ if value is None:
247
+ value_str = "N/A"
248
+ elif isinstance(value, float):
249
+ value_str = f"{value:.4f}"
250
+ else:
251
+ value_str = str(value)
252
+
253
+ lines.append(f"{key:30s}: {value_str}")
254
+
255
+ lines.append("=" * 50)
256
+ return "\n".join(lines)
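A minimal smoke test for the helpers above — a sketch, not part of the commit. It assumes `src/utils` is on `sys.path` so the module imports as `metrics`, and that scikit-learn is installed.

```python
# Cluster two well-separated synthetic blobs and report the internal metrics.
import numpy as np
from sklearn.cluster import KMeans
from metrics import calculate_clustering_metrics, format_metrics_report  # assumed import path

rng = np.random.default_rng(42)
X = np.vstack([rng.normal(0, 1, (50, 8)), rng.normal(5, 1, (50, 8))])  # two separable blobs
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

report = calculate_clustering_metrics(X, labels)
print(format_metrics_report(report))  # silhouette should be close to 1 for well-separated blobs
```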
models/anomaly-detection/src/utils/vectorizer.py ADDED
@@ -0,0 +1,243 @@
1
+ """
2
+ models/anomaly-detection/src/utils/vectorizer.py
3
+ Text vectorization using language-specific BERT models (downloaded locally)
4
+ """
5
+ import os
6
+ import logging
7
+ from typing import Dict, List, Optional, Tuple
8
+ from pathlib import Path
9
+ import numpy as np
10
+
11
+ logger = logging.getLogger("vectorizer")
12
+
13
+ # Transformers
14
+ try:
15
+ from transformers import AutoTokenizer, AutoModel
16
+ import torch
17
+ TRANSFORMERS_AVAILABLE = True
18
+ except ImportError:
19
+ TRANSFORMERS_AVAILABLE = False
20
+ logger.warning("Transformers not available. Install with: pip install transformers torch")
21
+
22
+ # Sentence Transformers for fallback
23
+ try:
24
+ from sentence_transformers import SentenceTransformer
25
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
26
+ except ImportError:
27
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
28
+
29
+
30
+ class MultilingualVectorizer:
31
+ """
32
+ Vectorizer using language-specific BERT models.
33
+ Downloads and caches models locally from HuggingFace.
34
+
35
+ Models:
36
+ - English: distilbert-base-uncased (fast, accurate)
37
+ - Sinhala: keshan/SinhalaBERTo (specialized)
38
+ - Tamil: l3cube-pune/tamil-bert (specialized)
39
+ """
40
+
41
+ MODEL_MAP = {
42
+ "english": "distilbert-base-uncased",
43
+ "sinhala": "keshan/SinhalaBERTo",
44
+ "tamil": "l3cube-pune/tamil-bert"
45
+ }
46
+
47
+ def __init__(self, models_cache_dir: Optional[str] = None, device: Optional[str] = None):
48
+ """
49
+ Initialize the multilingual vectorizer.
50
+
51
+ Args:
52
+ models_cache_dir: Directory to cache downloaded models
53
+ device: 'cuda' or 'cpu' (auto-detected if None)
54
+ """
55
+ self.models_cache_dir = models_cache_dir or str(
56
+ Path(__file__).parent.parent.parent / "models_cache"
57
+ )
58
+ Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)
59
+
60
+ # Set cache dir for HuggingFace
61
+ os.environ["TRANSFORMERS_CACHE"] = self.models_cache_dir
62
+ os.environ["HF_HOME"] = self.models_cache_dir
63
+
64
+ # Auto-detect device
65
+ if device is None:
66
+ if TRANSFORMERS_AVAILABLE and torch.cuda.is_available():
67
+ self.device = "cuda"
68
+ else:
69
+ self.device = "cpu"
70
+ else:
71
+ self.device = device
72
+
73
+ logger.info(f"[Vectorizer] Using device: {self.device}")
74
+
75
+ # Lazy load models
76
+ self.models: Dict[str, Tuple] = {} # {lang: (tokenizer, model)}
77
+ self.fallback_model = None
78
+
79
+ def _load_model(self, language: str) -> Tuple:
80
+ """
81
+ Load language-specific model from cache or download.
82
+
83
+ Returns:
84
+ Tuple of (tokenizer, model)
85
+ """
86
+ if language in self.models:
87
+ return self.models[language]
88
+
89
+ model_name = self.MODEL_MAP.get(language, self.MODEL_MAP["english"])
90
+
91
+ if not TRANSFORMERS_AVAILABLE:
92
+ raise RuntimeError("Transformers library not available")
93
+
94
+ logger.info(f"[Vectorizer] Loading model: {model_name}")
95
+
96
+ try:
97
+ tokenizer = AutoTokenizer.from_pretrained(
98
+ model_name,
99
+ cache_dir=self.models_cache_dir
100
+ )
101
+ model = AutoModel.from_pretrained(
102
+ model_name,
103
+ cache_dir=self.models_cache_dir
104
+ ).to(self.device)
105
+ model.eval()
106
+
107
+ self.models[language] = (tokenizer, model)
108
+ logger.info(f"[Vectorizer] ✓ Loaded {model_name} ({language})")
109
+ return tokenizer, model
110
+
111
+ except Exception as e:
112
+ logger.error(f"[Vectorizer] Failed to load {model_name}: {e}")
113
+ # Fallback to English model
114
+ if language != "english":
115
+ logger.info("[Vectorizer] Falling back to English model")
116
+ return self._load_model("english")
117
+ raise
118
+
119
+ def _get_embedding(self, text: str, tokenizer, model) -> np.ndarray:
120
+ """
121
+ Get embedding vector using mean pooling.
122
+
123
+ Args:
124
+ text: Input text
125
+ tokenizer: HuggingFace tokenizer
126
+ model: HuggingFace model
127
+
128
+ Returns:
129
+ 768-dim numpy array
130
+ """
131
+ if not TRANSFORMERS_AVAILABLE:
132
+ raise RuntimeError("Transformers not available")
133
+
134
+ # Tokenize
135
+ inputs = tokenizer(
136
+ text,
137
+ return_tensors="pt",
138
+ truncation=True,
139
+ max_length=512,
140
+ padding=True
141
+ ).to(self.device)
142
+
143
+ # Get embeddings
144
+ with torch.no_grad():
145
+ outputs = model(**inputs)
146
+
147
+ # Mean pooling over sequence length
148
+ attention_mask = inputs["attention_mask"]
149
+ hidden_states = outputs.last_hidden_state
150
+
151
+ # Mask and average
152
+ mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
153
+ sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
154
+ sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
155
+ mean_embedding = sum_embeddings / sum_mask
156
+
157
+ return mean_embedding.cpu().numpy().flatten()
158
+
159
+ def vectorize(self, text: str, language: str = "english") -> np.ndarray:
160
+ """
161
+ Convert text to vector embedding.
162
+
163
+ Args:
164
+ text: Input text
165
+ language: 'english', 'sinhala', 'tamil', or 'unknown'
166
+
167
+ Returns:
168
+ 768-dim numpy array
169
+ """
170
+ if not text or not text.strip():
171
+ return np.zeros(768)
172
+
173
+ # Map unknown to english
174
+ if language == "unknown":
175
+ language = "english"
176
+
177
+ try:
178
+ tokenizer, model = self._load_model(language)
179
+ return self._get_embedding(text, tokenizer, model)
180
+ except Exception as e:
181
+ logger.error(f"[Vectorizer] Error vectorizing: {e}")
182
+ # Return zeros as fallback
183
+ return np.zeros(768)
184
+
185
+ def vectorize_batch(
186
+ self,
187
+ texts: List[str],
188
+ languages: Optional[List[str]] = None
189
+ ) -> np.ndarray:
190
+ """
191
+ Batch vectorization for multiple texts.
192
+
193
+ Args:
194
+ texts: List of text strings
195
+ languages: Optional list of language codes (same length as texts)
196
+
197
+ Returns:
198
+ numpy array of shape (n_texts, 768)
199
+ """
200
+ if languages is None:
201
+ languages = ["english"] * len(texts)
202
+
203
+ embeddings = []
204
+ for text, lang in zip(texts, languages):
205
+ emb = self.vectorize(text, lang)
206
+ embeddings.append(emb)
207
+
208
+ return np.array(embeddings)
209
+
210
+ def download_all_models(self):
211
+ """Pre-download all language models"""
212
+ for language in self.MODEL_MAP.keys():
213
+ try:
214
+ logger.info(f"[Vectorizer] Pre-downloading {language} model...")
215
+ self._load_model(language)
216
+ except Exception as e:
217
+ logger.warning(f"[Vectorizer] Failed to download {language}: {e}")
218
+
219
+
220
+ # Singleton instance
221
+ _vectorizer: Optional[MultilingualVectorizer] = None
222
+
223
+
224
+ def get_vectorizer(models_cache_dir: Optional[str] = None) -> MultilingualVectorizer:
225
+ """Get or create singleton vectorizer instance"""
226
+ global _vectorizer
227
+ if _vectorizer is None:
228
+ _vectorizer = MultilingualVectorizer(models_cache_dir)
229
+ return _vectorizer
230
+
231
+
232
+ def vectorize_text(text: str, language: str = "english") -> np.ndarray:
233
+ """
234
+ Convenience function for text vectorization.
235
+
236
+ Args:
237
+ text: Input text
238
+ language: Language code
239
+
240
+ Returns:
241
+ 768-dim numpy array
242
+ """
243
+ return get_vectorizer().vectorize(text, language)
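A hypothetical usage of the vectorizer above — not part of the commit. It assumes `transformers` and `torch` are installed and that the module is importable as `vectorizer`; the first call downloads the language-specific models into the cache directory.

```python
# Embed a mixed-language batch; each text is routed to its language-specific BERT model.
from vectorizer import get_vectorizer  # assumed import path

vec = get_vectorizer(models_cache_dir="./models_cache")
texts = ["Central Bank raises policy rates", "මහ බැංකුව පොලී අනුපාත ඉහළ දමයි"]
embeddings = vec.vectorize_batch(texts, languages=["english", "sinhala"])
print(embeddings.shape)  # (2, 768): one mean-pooled vector per document
```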
models/anomaly-detection/tests/dags/test_dag_example.py ADDED
@@ -0,0 +1,83 @@
1
+ """Example DAGs test. This test ensures that all Dags have tags, retries set to two, and no import errors. This is an example pytest and may not be fit the context of your DAGs. Feel free to add and remove tests."""
2
+
3
+ import os
4
+ import logging
5
+ from contextlib import contextmanager
6
+ import pytest
7
+ from airflow.models import DagBag
8
+
9
+
10
+ @contextmanager
11
+ def suppress_logging(namespace):
12
+ logger = logging.getLogger(namespace)
13
+ old_value = logger.disabled
14
+ logger.disabled = True
15
+ try:
16
+ yield
17
+ finally:
18
+ logger.disabled = old_value
19
+
20
+
21
+ def get_import_errors():
22
+ """
23
+ Generate a tuple for import errors in the dag bag
24
+ """
25
+ with suppress_logging("airflow"):
26
+ dag_bag = DagBag(include_examples=False)
27
+
28
+ def strip_path_prefix(path):
29
+ return os.path.relpath(path, os.environ.get("AIRFLOW_HOME"))
30
+
31
+ # prepend "(None,None)" to ensure that a test object is always created even if it's a no op.
32
+ return [(None, None)] + [
33
+ (strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items()
34
+ ]
35
+
36
+
37
+ def get_dags():
38
+ """
39
+ Generate a tuple of dag_id, <DAG objects> in the DagBag
40
+ """
41
+ with suppress_logging("airflow"):
42
+ dag_bag = DagBag(include_examples=False)
43
+
44
+ def strip_path_prefix(path):
45
+ return os.path.relpath(path, os.environ.get("AIRFLOW_HOME"))
46
+
47
+ return [(k, v, strip_path_prefix(v.fileloc)) for k, v in dag_bag.dags.items()]
48
+
49
+
50
+ @pytest.mark.parametrize(
51
+ "rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()]
52
+ )
53
+ def test_file_imports(rel_path, rv):
54
+ """Test for import errors on a file"""
55
+ if rel_path and rv:
56
+ raise Exception(f"{rel_path} failed to import with message \n {rv}")
57
+
58
+
59
+ APPROVED_TAGS = {}
60
+
61
+
62
+ @pytest.mark.parametrize(
63
+ "dag_id,dag,fileloc", get_dags(), ids=[x[2] for x in get_dags()]
64
+ )
65
+ def test_dag_tags(dag_id, dag, fileloc):
66
+ """
67
+ test if a DAG is tagged and if those TAGs are in the approved list
68
+ """
69
+ assert dag.tags, f"{dag_id} in {fileloc} has no tags"
70
+ if APPROVED_TAGS:
71
+ assert not set(dag.tags) - APPROVED_TAGS
72
+
73
+
74
+ @pytest.mark.parametrize(
75
+ "dag_id,dag, fileloc", get_dags(), ids=[x[2] for x in get_dags()]
76
+ )
77
+ def test_dag_retries(dag_id, dag, fileloc):
78
+ """
79
+ test if a DAG has retries set
80
+ """
81
+ assert (
82
+ (dag.default_args.get("retries") or 0) >= 2
83
+ ), f"{dag_id} in {fileloc} must have task retries >= 2."
models/currency-volatility-prediction/.github/workflows/main.yaml ADDED
File without changes
models/currency-volatility-prediction/.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+
13
+ .env
models/currency-volatility-prediction/.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.11
models/currency-volatility-prediction/Dockerfile ADDED
@@ -0,0 +1,8 @@
1
+ FROM python:3.11-slim-bookworm
+ WORKDIR /app
+ COPY . /app
+
+ RUN apt-get update -y && apt-get install -y --no-install-recommends awscli && rm -rf /var/lib/apt/lists/*
+
+ RUN pip install --no-cache-dir -r requirements.txt
+ CMD ["python3", "app.py"]
models/currency-volatility-prediction/README.md ADDED
File without changes
models/currency-volatility-prediction/app.py ADDED
File without changes
models/currency-volatility-prediction/dags/currency_prediction_dag.py ADDED
@@ -0,0 +1,212 @@
1
+ """
2
+ models/currency-volatility-prediction/dags/currency_prediction_dag.py
3
+ Airflow DAG for daily USD/LKR currency prediction
4
+ Runs daily at 04:00 in the scheduler's timezone (UTC by default)
5
+ """
6
+ import os
7
+ import sys
8
+ from datetime import datetime, timedelta
9
+ from pathlib import Path
10
+
11
+ # Add paths for imports
12
+ PIPELINE_ROOT = Path(__file__).parent.parent
13
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
14
+
15
+ from airflow import DAG
16
+ from airflow.operators.python import PythonOperator
17
+
18
+ # Load environment variables from root .env
19
+ try:
20
+ from dotenv import load_dotenv
21
+ # Path: dags/ -> currency-volatility-prediction/ -> models/ -> root/
22
+ env_path = Path(__file__).parent.parent.parent.parent / ".env"
23
+ if env_path.exists():
24
+ load_dotenv(env_path)
25
+ print(f"[MLflow] ✓ Loaded env from {env_path}")
26
+ except ImportError:
27
+ pass
28
+
29
+
30
+ # Default arguments
31
+ default_args = {
32
+ "owner": "modelx",
33
+ "depends_on_past": False,
34
+ "email_on_failure": False,
35
+ "email_on_retry": False,
36
+ "retries": 2,
37
+ "retry_delay": timedelta(minutes=5),
38
+ }
39
+
40
+
41
+ def ingest_data(**context):
42
+ """Task: Ingest currency data from yfinance."""
43
+ from components.data_ingestion import CurrencyDataIngestion
44
+ from entity.config_entity import DataIngestionConfig
45
+
46
+ print("[CURRENCY DAG] Starting data ingestion...")
47
+
48
+ config = DataIngestionConfig(history_period="2y")
49
+ ingestion = CurrencyDataIngestion(config)
50
+
51
+ # Check if we have recent data
52
+ try:
53
+ df = ingestion.load_existing()
54
+ latest_date = df["date"].max()
55
+ if isinstance(latest_date, str):
56
+ latest_date = datetime.strptime(latest_date, "%Y-%m-%d")
57
+
58
+ days_old = (datetime.now() - latest_date).days
59
+
60
+ if days_old < 1:
61
+ print(f"[CURRENCY DAG] Data is current ({days_old} days old)")
62
+ context["ti"].xcom_push(key="data_path", value=str(ingestion.config.raw_data_dir))
63
+ return str(ingestion.config.raw_data_dir)
64
+ except FileNotFoundError:
65
+ pass
66
+
67
+ # Full ingestion
68
+ data_path = ingestion.ingest_all()
69
+ context["ti"].xcom_push(key="data_path", value=data_path)
70
+
71
+ print(f"[CURRENCY DAG] ✓ Data saved to {data_path}")
72
+ return data_path
73
+
74
+
75
+ def train_model(**context):
76
+ """Task: Train GRU model."""
77
+ from components.model_trainer import CurrencyGRUTrainer
78
+ from components.data_ingestion import CurrencyDataIngestion
79
+ from entity.config_entity import ModelTrainerConfig
80
+
81
+ print("[CURRENCY DAG] Starting model training...")
82
+
83
+ # Load data
84
+ ingestion = CurrencyDataIngestion()
85
+ df = ingestion.load_existing()
86
+
87
+ print(f"[CURRENCY DAG] Loaded {len(df)} records")
88
+
89
+ # Train
90
+ config = ModelTrainerConfig(
91
+ epochs=100,
92
+ batch_size=16,
93
+ early_stopping_patience=15
94
+ )
95
+ trainer = CurrencyGRUTrainer(config)
96
+
97
+ results = trainer.train(df=df, use_mlflow=True)
98
+
99
+ print(f"[CURRENCY DAG] ✓ Training complete!")
100
+ print(f" MAE: {results['test_mae']:.4f} LKR")
101
+ print(f" Direction Accuracy: {results['direction_accuracy']*100:.1f}%")
102
+
103
+ context["ti"].xcom_push(key="model_path", value=results["model_path"])
104
+ return results
105
+
106
+
107
+ def generate_prediction(**context):
108
+ """Task: Generate next-day prediction."""
109
+ from components.predictor import CurrencyPredictor
110
+ from components.data_ingestion import CurrencyDataIngestion
111
+
112
+ print("[CURRENCY DAG] Generating prediction...")
113
+
114
+ predictor = CurrencyPredictor()
115
+
116
+ try:
117
+ # Load latest data
118
+ ingestion = CurrencyDataIngestion()
119
+ df = ingestion.load_existing()
120
+
121
+ # Generate prediction
122
+ prediction = predictor.predict(df)
123
+
124
+ except FileNotFoundError:
125
+ # Model not trained, use fallback
126
+ print("[CURRENCY DAG] Model not trained, using fallback")
127
+ prediction = predictor.generate_fallback_prediction()
128
+ except Exception as e:
129
+ print(f"[CURRENCY DAG] Error predicting: {e}")
130
+ prediction = predictor.generate_fallback_prediction()
131
+
132
+ # Save prediction
133
+ output_path = predictor.save_prediction(prediction)
134
+
135
+ print(f"[CURRENCY DAG] ✓ Prediction generated!")
136
+ print(f" Current: {prediction['current_rate']} LKR/USD")
137
+ print(f" Predicted: {prediction['predicted_rate']} LKR/USD")
138
+ print(f" Change: {prediction['expected_change_pct']:+.2f}%")
139
+ print(f" Direction: {prediction['direction']}")
140
+
141
+ context["ti"].xcom_push(key="prediction_path", value=output_path)
142
+ return prediction
143
+
144
+
145
+ def publish_prediction(**context):
146
+ """Task: Log prediction summary."""
147
+ prediction = context["ti"].xcom_pull(task_ids="generate_prediction")
148
+
149
+ if prediction:
150
+ print("\n" + "="*50)
151
+ print(f"USD/LKR CURRENCY PREDICTION")
152
+ print("="*50)
153
+ print(f"Prediction for: {prediction.get('prediction_date')}")
154
+ print(f"Current Rate: {prediction.get('current_rate')} LKR/USD")
155
+ print(f"Predicted Rate: {prediction.get('predicted_rate')} LKR/USD")
156
+ print(f"Expected Change: {prediction.get('expected_change_pct'):+.3f}%")
157
+ print(f"Direction: {prediction.get('direction_emoji')} {prediction.get('direction')}")
158
+ print(f"Volatility: {prediction.get('volatility_class')}")
159
+ if prediction.get('is_fallback'):
160
+ print("⚠️ Using fallback model")
161
+ print("="*50 + "\n")
162
+
163
+ return True
164
+
165
+
166
+ # Define DAG
167
+ with DAG(
168
+ dag_id="currency_prediction_daily",
169
+ default_args=default_args,
170
+ description="Daily USD/LKR currency prediction using GRU neural network",
171
+ schedule_interval="0 4 * * *", # 4:00 AM daily (IST is UTC+5:30)
172
+ start_date=datetime(2024, 12, 1),
173
+ catchup=False,
174
+ tags=["currency", "ml", "prediction", "gru", "forex"],
175
+ ) as dag:
176
+
177
+ # Task 1: Ingest Data
178
+ task_ingest = PythonOperator(
179
+ task_id="ingest_data",
180
+ python_callable=ingest_data,
181
+ provide_context=True,
182
+ )
183
+
184
+ # Task 2: Train Model
185
+ task_train = PythonOperator(
186
+ task_id="train_model",
187
+ python_callable=train_model,
188
+ provide_context=True,
189
+ )
190
+
191
+ # Task 3: Generate Prediction
192
+ task_predict = PythonOperator(
193
+ task_id="generate_prediction",
194
+ python_callable=generate_prediction,
195
+ provide_context=True,
196
+ )
197
+
198
+ # Task 4: Publish Prediction
199
+ task_publish = PythonOperator(
200
+ task_id="publish_prediction",
201
+ python_callable=publish_prediction,
202
+ provide_context=True,
203
+ )
204
+
205
+ # Dependencies
206
+ task_ingest >> task_train >> task_predict >> task_publish
207
+
208
+
209
+ if __name__ == "__main__":
210
+ print("Currency Prediction DAG loaded successfully")
211
+ print(f"Schedule: Daily at 4:00 AM")
212
+ print(f"Tasks: {[t.task_id for t in dag.tasks]}")
models/currency-volatility-prediction/data_schema/schema.yaml ADDED
File without changes
models/currency-volatility-prediction/main.py ADDED
@@ -0,0 +1,179 @@
1
+ """
2
+ models/currency-volatility-prediction/main.py
3
+ Entry point for Currency Prediction Pipeline
4
+ Can run data collection, training, or prediction independently
5
+ """
6
+ import os
7
+ import sys
8
+ import logging
9
+ import argparse
10
+ from pathlib import Path
11
+ from datetime import datetime
12
+
13
+ # Setup paths
14
+ PIPELINE_ROOT = Path(__file__).parent
15
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
16
+
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
20
+ )
21
+ logger = logging.getLogger("currency_prediction")
22
+
23
+
24
+ def run_data_ingestion(period: str = "2y"):
25
+ """Run data ingestion from yfinance."""
26
+ from components.data_ingestion import CurrencyDataIngestion
27
+ from entity.config_entity import DataIngestionConfig
28
+
29
+ logger.info(f"Starting data ingestion ({period})...")
30
+
31
+ config = DataIngestionConfig(history_period=period)
32
+ ingestion = CurrencyDataIngestion(config)
33
+
34
+ data_path = ingestion.ingest_all()
35
+
36
+ df = ingestion.load_existing(data_path)
37
+
38
+ logger.info("Data Ingestion Complete!")
39
+ logger.info(f"Total records: {len(df)}")
40
+ logger.info(f"Features: {len(df.columns)}")
41
+ logger.info(f"Date range: {df['date'].min()} to {df['date'].max()}")
42
+ logger.info(f"Latest rate: {df['close'].iloc[-1]:.2f} LKR/USD")
43
+
44
+ return data_path
45
+
46
+
47
+ def run_training(epochs: int = 100):
48
+ """Run GRU model training."""
49
+ from components.data_ingestion import CurrencyDataIngestion
50
+ from components.model_trainer import CurrencyGRUTrainer
51
+ from entity.config_entity import ModelTrainerConfig
52
+
53
+ logger.info("Starting model training...")
54
+
55
+ # Load data
56
+ ingestion = CurrencyDataIngestion()
57
+ df = ingestion.load_existing()
58
+
59
+ logger.info(f"Loaded {len(df)} records with {len(df.columns)} features")
60
+
61
+ # Train
62
+ config = ModelTrainerConfig(epochs=epochs)
63
+ trainer = CurrencyGRUTrainer(config)
64
+
65
+ results = trainer.train(df=df, use_mlflow=True)
66
+
67
+ logger.info(f"\nTraining Results:")
68
+ logger.info(f" MAE: {results['test_mae']:.4f} LKR")
69
+ logger.info(f" RMSE: {results['rmse']:.4f} LKR")
70
+ logger.info(f" Direction Accuracy: {results['direction_accuracy']*100:.1f}%")
71
+ logger.info(f" Epochs: {results['epochs_trained']}")
72
+ logger.info(f" Model saved: {results['model_path']}")
73
+
74
+ return results
75
+
76
+
77
+ def run_prediction():
78
+ """Run prediction for next day."""
79
+ from components.data_ingestion import CurrencyDataIngestion
80
+ from components.predictor import CurrencyPredictor
81
+
82
+ logger.info("Generating prediction...")
83
+
84
+ predictor = CurrencyPredictor()
85
+
86
+ try:
87
+ ingestion = CurrencyDataIngestion()
88
+ df = ingestion.load_existing()
89
+ prediction = predictor.predict(df)
90
+ except FileNotFoundError:
91
+ logger.warning("Model not trained, using fallback")
92
+ prediction = predictor.generate_fallback_prediction()
93
+ except Exception as e:
94
+ logger.error(f"Error: {e}")
95
+ prediction = predictor.generate_fallback_prediction()
96
+
97
+ output_path = predictor.save_prediction(prediction)
98
+
99
+ # Display
100
+ logger.info(f"\n{'='*50}")
101
+ logger.info(f"USD/LKR PREDICTION FOR {prediction['prediction_date']}")
102
+ logger.info(f"{'='*50}")
103
+ logger.info(f"Current Rate: {prediction['current_rate']:.2f} LKR/USD")
104
+ logger.info(f"Predicted Rate: {prediction['predicted_rate']:.2f} LKR/USD")
105
+ logger.info(f"Expected Change: {prediction['expected_change_pct']:+.3f}%")
106
+ logger.info(f"Direction: {prediction['direction_emoji']} LKR {prediction['direction']}")
107
+ logger.info(f"Volatility: {prediction['volatility_class']}")
108
+
109
+ if prediction.get('weekly_trend'):
110
+ logger.info(f"Weekly Trend: {prediction['weekly_trend']:+.2f}%")
111
+ if prediction.get('monthly_trend'):
112
+ logger.info(f"Monthly Trend: {prediction['monthly_trend']:+.2f}%")
113
+
114
+ logger.info(f"{'='*50}")
115
+ logger.info(f"Saved to: {output_path}")
116
+
117
+ return prediction
118
+
119
+
120
+ def run_full_pipeline():
121
+ """Run the complete pipeline: ingest → train → predict."""
122
+ logger.info("=" * 60)
123
+ logger.info("CURRENCY PREDICTION PIPELINE - FULL RUN")
124
+ logger.info("=" * 60)
125
+
126
+ # Step 1: Data Ingestion
127
+ try:
128
+ run_data_ingestion(period="2y")
129
+ except Exception as e:
130
+ logger.error(f"Data ingestion failed: {e}")
131
+ return None
132
+
133
+ # Step 2: Training
134
+ try:
135
+ run_training(epochs=100)
136
+ except Exception as e:
137
+ logger.error(f"Training failed: {e}")
138
+
139
+ # Step 3: Prediction
140
+ prediction = run_prediction()
141
+
142
+ logger.info("=" * 60)
143
+ logger.info("PIPELINE COMPLETE!")
144
+ logger.info("=" * 60)
145
+
146
+ return prediction
147
+
148
+
149
+ if __name__ == "__main__":
150
+ parser = argparse.ArgumentParser(description="Currency Prediction Pipeline")
151
+ parser.add_argument(
152
+ "--mode",
153
+ choices=["ingest", "train", "predict", "full"],
154
+ default="predict",
155
+ help="Pipeline mode to run"
156
+ )
157
+ parser.add_argument(
158
+ "--period",
159
+ type=str,
160
+ default="2y",
161
+ help="Data period (1y, 2y, 5y)"
162
+ )
163
+ parser.add_argument(
164
+ "--epochs",
165
+ type=int,
166
+ default=100,
167
+ help="Training epochs"
168
+ )
169
+
170
+ args = parser.parse_args()
171
+
172
+ if args.mode == "ingest":
173
+ run_data_ingestion(period=args.period)
174
+ elif args.mode == "train":
175
+ run_training(epochs=args.epochs)
176
+ elif args.mode == "predict":
177
+ run_prediction()
178
+ elif args.mode == "full":
179
+ run_full_pipeline()
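Programmatic use — a sketch, not part of the commit — showing the same steps the CLI exposes, driven from another script. It assumes this file and the `src/` package are importable from the caller.

```python
# Run ingestion, a short training pass, and a prediction without the argparse CLI.
from main import run_data_ingestion, run_training, run_prediction  # assumed import path

run_data_ingestion(period="1y")   # shorter history for a quick experiment
run_training(epochs=10)           # short run just to produce a model artifact
prediction = run_prediction()
print(prediction["predicted_rate"], prediction["direction"])
```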
models/currency-volatility-prediction/pyproject.toml ADDED
@@ -0,0 +1,20 @@
1
+ [project]
2
+ name = "modelx-final-models"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "certifi>=2025.11.12",
9
+ "dagshub>=0.6.3",
10
+ "fastapi>=0.122.0",
11
+ "mlflow>=3.6.0",
12
+ "numpy>=2.3.5",
13
+ "pandas>=2.3.3",
14
+ "pyaml>=25.7.0",
15
+ "pymongo[srv]>=4.15.4",
16
+ "python-dotenv>=1.2.1",
17
+ "python-multipart>=0.0.20",
18
+ "scikit-learn>=1.7.2",
19
+ "uvicorn>=0.38.0",
20
+ ]
models/currency-volatility-prediction/requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ python-dotenv
2
+ pandas
3
+ numpy
4
+ pymongo
5
+ certifi
6
+ pymongo[srv]
7
+ scikit-learn
8
+ mlflow
9
+ pyaml
10
+ dagshub
11
+ fastapi
12
+ uvicorn
13
+ python-multipart
14
+
15
+ #-e .
models/currency-volatility-prediction/setup.py ADDED
@@ -0,0 +1,47 @@
1
+
2
+ '''
3
+ The setup.py file is an essential part of packaging and
4
+ distributing Python projects. It is used by setuptools
5
+ (or distutils in older Python versions) to define the configuration
6
+ of your project, such as its metadata, dependencies, and more
7
+ '''
8
+
9
+ from setuptools import find_packages, setup
10
+ # find_packages scans the project and collects every folder that has an __init__.py file
11
+ # setup is responsible for providing all the metadata about the project
12
+
13
+ from typing import List
14
+
15
+ def get_requirements()->List[str]:
16
+ """
17
+ This function will return a list of requirements
18
+ """
19
+ requirement_lst:List[str]=[]
20
+ try:
21
+ with open('requirements.txt', 'r') as file:
22
+ # Read lines from the file
23
+ lines=file.readlines()
24
+ ## Process each line
25
+ for line in lines:
26
+ requirement=line.strip()
27
+ ## Ignore empty lines and -e .
28
+
29
+ if requirement and requirement != '-e .':
30
+ requirement_lst.append(requirement)
31
+
32
+
33
+
34
+ except FileNotFoundError:
35
+ print("requirements.txt file not found")
36
+
37
+ return requirement_lst
38
+
39
+ setup(
40
+ name="NetworkSecurity",
41
+ version="0.0.1", # This can be changed later
42
+ author="Nivakaran S.",
43
+ author_email="nivakaran@hotmail.com",
44
+ packages=find_packages(),
45
+ install_requires=get_requirements()
46
+ )
47
+
models/currency-volatility-prediction/src/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ import logging
2
+ import os
3
+ from datetime import datetime
4
+
5
+ LOG_FILE=f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"
6
+
7
+ logs_dir=os.path.join(os.getcwd(), "logs")
+
+ os.makedirs(logs_dir, exist_ok=True)
+ # Create the logs directory if it does not already exist
+
+ LOG_FILE_PATH=os.path.join(logs_dir, LOG_FILE)
13
+
14
+ logging.basicConfig(
15
+ filename=LOG_FILE_PATH,
16
+ format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
17
+ level=logging.INFO
18
+ )
19
+
20
+
21
+
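A small sketch of how a component module picks this configuration up — not part of the commit: importing the package runs the `basicConfig` call above, so module-level loggers write into `logs/<timestamp>.log`.

```python
# Inside any module of the package, after `import src` has run the logging setup.
import logging

logger = logging.getLogger(__name__)
logger.info("currency pipeline component initialised")
```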
models/currency-volatility-prediction/src/components/__init__.py ADDED
File without changes