Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- .env.template +46 -0
- .gitignore +52 -8
- models/anomaly-detection/.dockerignore +8 -0
- models/anomaly-detection/.gitignore +11 -0
- models/anomaly-detection/Dockerfile +1 -0
- models/anomaly-detection/README.md +45 -0
- models/anomaly-detection/artifacts/data_ingestion/ingested_data_20251208_152330.parquet +3 -0
- models/anomaly-detection/artifacts/data_transformation/embeddings_20251208_152444.npy +3 -0
- models/anomaly-detection/artifacts/data_transformation/features_20251208_152444.npy +3 -0
- models/anomaly-detection/artifacts/data_transformation/transformed_data_20251208_152444.parquet +3 -0
- models/anomaly-detection/artifacts/data_validation/validated_data_20251208_152332.parquet +3 -0
- models/anomaly-detection/artifacts/data_validation/validation_report_20251208_152332.yaml +7 -0
- models/anomaly-detection/dags/.airflowignore +0 -0
- models/anomaly-detection/dags/exampledag.py +100 -0
- models/anomaly-detection/dags/train_anomaly_model.py +241 -0
- models/anomaly-detection/data_schema/schema.yaml +133 -0
- models/anomaly-detection/download_models.py +86 -0
- models/anomaly-detection/main.py +85 -0
- models/anomaly-detection/packages.txt +0 -0
- models/anomaly-detection/requirements.txt +37 -0
- models/anomaly-detection/src/__init__.py +18 -0
- models/anomaly-detection/src/components/__init__.py +29 -0
- models/anomaly-detection/src/components/data_ingestion.py +247 -0
- models/anomaly-detection/src/components/data_transformation.py +458 -0
- models/anomaly-detection/src/components/data_validation.py +261 -0
- models/anomaly-detection/src/components/model_trainer.py +478 -0
- models/anomaly-detection/src/entity/__init__.py +30 -0
- models/anomaly-detection/src/entity/artifact_entity.py +79 -0
- models/anomaly-detection/src/entity/config_entity.py +109 -0
- models/anomaly-detection/src/pipeline/__init__.py +6 -0
- models/anomaly-detection/src/pipeline/training_pipeline.py +162 -0
- models/anomaly-detection/src/utils/__init__.py +24 -0
- models/anomaly-detection/src/utils/language_detector.py +209 -0
- models/anomaly-detection/src/utils/metrics.py +256 -0
- models/anomaly-detection/src/utils/vectorizer.py +243 -0
- models/anomaly-detection/tests/dags/test_dag_example.py +83 -0
- models/currency-volatility-prediction/.github/workflows/main.yaml +0 -0
- models/currency-volatility-prediction/.gitignore +13 -0
- models/currency-volatility-prediction/.python-version +1 -0
- models/currency-volatility-prediction/Dockerfile +8 -0
- models/currency-volatility-prediction/README.md +0 -0
- models/currency-volatility-prediction/app.py +0 -0
- models/currency-volatility-prediction/dags/currency_prediction_dag.py +212 -0
- models/currency-volatility-prediction/data_schema/schema.yaml +0 -0
- models/currency-volatility-prediction/main.py +179 -0
- models/currency-volatility-prediction/pyproject.toml +20 -0
- models/currency-volatility-prediction/requirements.txt +15 -0
- models/currency-volatility-prediction/setup.py +47 -0
- models/currency-volatility-prediction/src/__init__.py +21 -0
- models/currency-volatility-prediction/src/components/__init__.py +0 -0
.env.template
ADDED
@@ -0,0 +1,46 @@
# ModelX Intelligence Platform - Production Configuration
# Copy this to .env and fill in your values

# =============================================================================
# LLM CONFIGURATION
# =============================================================================
# Get your free API key from https://console.groq.com
GROQ_API_KEY=your_groq_api_key_here

# =============================================================================
# STORAGE CONFIGURATION
# =============================================================================

# SQLite Cache
SQLITE_DB_PATH=data/cache/feeds.db
SQLITE_RETENTION_HOURS=24

# ChromaDB (Semantic Search)
CHROMADB_PATH=data/chromadb
CHROMADB_COLLECTION=modelx_feeds
CHROMADB_SIMILARITY_THRESHOLD=0.85
CHROMADB_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2

# Neo4j Knowledge Graph
NEO4J_ENABLED=false # Set to 'true' to enable (requires Docker)
NEO4J_URI=bolt://localhost:7687
NEO4J_USER=neo4j
NEO4J_PASSWORD=modelx2024

# CSV Export
CSV_EXPORT_DIR=data/feeds

# Deduplication
EXACT_MATCH_CHARS=120

# =============================================================================
# API CONFIGURATION
# =============================================================================
API_HOST=0.0.0.0
API_PORT=8000
API_WORKERS=1

# =============================================================================
# FRONTEND CONFIGURATION
# =============================================================================
NEXT_PUBLIC_API_URL=http://localhost:8000
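The pipeline code in this upload reads these settings via python-dotenv and os.getenv (see the DAG and main.py further down). A minimal sketch of how the template's variables might be consumed, assuming the template has been copied to .env as the comment instructs:

import os
from dotenv import load_dotenv

load_dotenv()  # pick up .env from the working directory

sqlite_db_path = os.getenv("SQLITE_DB_PATH", "data/cache/feeds.db")
similarity_threshold = float(os.getenv("CHROMADB_SIMILARITY_THRESHOLD", "0.85"))
neo4j_enabled = os.getenv("NEO4J_ENABLED", "false").strip().lower() == "true"
api_port = int(os.getenv("API_PORT", "8000"))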
.gitignore
CHANGED
@@ -9,15 +9,59 @@ wheels/
# Virtual environments
.venv

+# Environment files
+.env

+# LangGraph
+.langgraph_api

+# =============================================================================
+# ML MODEL ARTIFACTS (Large files - don't push to Git)
+# =============================================================================
+
+# Trained model weights (large binary files)
+*.h5
+*.hdf5
+*.joblib
+*.pkl
+*.pickle
+*.pt
+*.pth
+*.onnx
+*.pb
+
+# Model output directories
+models/*/artifacts/models/
+models/*/output/
+models/*/models_cache/
+models/*/checkpoints/
+
+# Airflow local state
+models/*/.astro/
+
+# MLflow artifacts (tracked separately)
+mlruns/
+mlartifacts/
+
+# =============================================================================
+# DATA FILES (Can be large)
+# =============================================================================
data/
datasets/
+
+# Database files
+*.db
+*.sqlite
+*.sqlite3
+
+# ChromaDB persistence (can be large)
+chroma_db/
+
+# =============================================================================
+# KEEP THESE (source code, configs)
+# =============================================================================
+# The models/ folders themselves ARE tracked for:
+# - main.py, src/, dags/ (pipeline code)
+# - requirements.txt, setup.py (dependencies)
+# - data_schema/ (validation configs)
+# - README.md (documentation)
models/anomaly-detection/.dockerignore
ADDED
@@ -0,0 +1,8 @@
astro
.git
.env
airflow_settings.yaml
logs/
.venv
airflow.db
airflow.cfg
models/anomaly-detection/.gitignore
ADDED
@@ -0,0 +1,11 @@
.git
.env
.DS_Store
airflow_settings.yaml
__pycache__/
astro
.venv
airflow-webserver.pid
webserver_config.py
airflow.cfg
airflow.db
models/anomaly-detection/Dockerfile
ADDED
@@ -0,0 +1 @@
FROM astrocrpublic.azurecr.io/runtime:3.1-7
models/anomaly-detection/README.md
ADDED
@@ -0,0 +1,45 @@
Overview
========

Welcome to Astronomer! This project was generated after you ran 'astro dev init' using the Astronomer CLI. This readme describes the contents of the project, as well as how to run Apache Airflow on your local machine.

Project Contents
================

Your Astro project contains the following files and folders:

- dags: This folder contains the Python files for your Airflow DAGs. By default, this directory includes one example DAG:
    - `example_astronauts`: This DAG shows a simple ETL pipeline example that queries the list of astronauts currently in space from the Open Notify API and prints a statement for each astronaut. The DAG uses the TaskFlow API to define tasks in Python, and dynamic task mapping to dynamically print a statement for each astronaut. For more on how this DAG works, see our [Getting started tutorial](https://www.astronomer.io/docs/learn/get-started-with-airflow).
- Dockerfile: This file contains a versioned Astro Runtime Docker image that provides a differentiated Airflow experience. If you want to execute other commands or overrides at runtime, specify them here.
- include: This folder contains any additional files that you want to include as part of your project. It is empty by default.
- packages.txt: Install OS-level packages needed for your project by adding them to this file. It is empty by default.
- requirements.txt: Install Python packages needed for your project by adding them to this file. It is empty by default.
- plugins: Add custom or community plugins for your project to this file. It is empty by default.
- airflow_settings.yaml: Use this local-only file to specify Airflow Connections, Variables, and Pools instead of entering them in the Airflow UI as you develop DAGs in this project.

Deploy Your Project Locally
===========================

Start Airflow on your local machine by running 'astro dev start'.

This command will spin up five Docker containers on your machine, each for a different Airflow component:

- Postgres: Airflow's Metadata Database
- Scheduler: The Airflow component responsible for monitoring and triggering tasks
- DAG Processor: The Airflow component responsible for parsing DAGs
- API Server: The Airflow component responsible for serving the Airflow UI and API
- Triggerer: The Airflow component responsible for triggering deferred tasks

When all five containers are ready the command will open the browser to the Airflow UI at http://localhost:8080/. You should also be able to access your Postgres Database at 'localhost:5432/postgres' with username 'postgres' and password 'postgres'.

Note: If you already have either of the above ports allocated, you can either [stop your existing Docker containers or change the port](https://www.astronomer.io/docs/astro/cli/troubleshoot-locally#ports-are-not-available-for-my-local-airflow-webserver).

Deploy Your Project to Astronomer
=================================

If you have an Astronomer account, pushing code to a Deployment on Astronomer is simple. For deploying instructions, refer to Astronomer documentation: https://www.astronomer.io/docs/astro/deploy-code/

Contact
=======

The Astronomer CLI is maintained with love by the Astronomer team. To report a bug or suggest a change, reach out to our support.
models/anomaly-detection/artifacts/data_ingestion/ingested_data_20251208_152330.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3273d845dda217a76cae1c63e9e181fad07c212ed7bbbce008ceaa380012e586
size 104390
models/anomaly-detection/artifacts/data_transformation/embeddings_20251208_152444.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f476c6f7aee3c0d1acb74e054dd4a3b5c47170b8f476f5f273a9a87c106d4d14
size 586880
models/anomaly-detection/artifacts/data_transformation/features_20251208_152444.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fad8dcf39385d403aca32cae883fa2030055bef9d0e5ee589bd86799adad69e3
size 1185856
models/anomaly-detection/artifacts/data_transformation/transformed_data_20251208_152444.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42202d32a4c0a02477aa9b90c902a2dd0ce057dc66562a1629198636d5431f80
size 112761
models/anomaly-detection/artifacts/data_validation/validated_data_20251208_152332.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3273d845dda217a76cae1c63e9e181fad07c212ed7bbbce008ceaa380012e586
size 104390
models/anomaly-detection/artifacts/data_validation/validation_report_20251208_152332.yaml
ADDED
@@ -0,0 +1,7 @@
errors: []
input_path: C:\Users\LENOVO\Desktop\ModelX-Ultimate\models\anomaly-detection\artifacts\data_ingestion\ingested_data_20251208_152330.parquet
invalid_records: 0
total_records: 191
valid_records: 191
validation_status: true
validation_timestamp: '20251208_152332'
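A minimal sketch of how a report like the one above could be consumed downstream, assuming pyyaml (already in the pipeline's requirements) and the example artifact path shown above:

import yaml

report_path = "artifacts/data_validation/validation_report_20251208_152332.yaml"
with open(report_path) as f:
    report = yaml.safe_load(f)

# Gate the next pipeline stage on the validation outcome recorded in the report.
if not report["validation_status"]:
    raise RuntimeError(f"Validation failed: {report['errors']}")
print(f"{report['valid_records']}/{report['total_records']} records passed validation")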
models/anomaly-detection/dags/.airflowignore
ADDED
File without changes
models/anomaly-detection/dags/exampledag.py
ADDED
@@ -0,0 +1,100 @@
"""
## Astronaut ETL example DAG

This DAG queries the list of astronauts currently in space from the
Open Notify API and prints each astronaut's name and flying craft.

There are two tasks, one to get the data from the API and save the results,
and another to print the results. Both tasks are written in Python using
Airflow's TaskFlow API, which allows you to easily turn Python functions into
Airflow tasks, and automatically infer dependencies and pass data.

The second task uses dynamic task mapping to create a copy of the task for
each Astronaut in the list retrieved from the API. This list will change
depending on how many Astronauts are in space, and the DAG will adjust
accordingly each time it runs.

For more explanation and getting started instructions, see our Write your
first DAG tutorial: https://www.astronomer.io/docs/learn/get-started-with-airflow

"""

from airflow.sdk.definitions.asset import Asset
from airflow.decorators import dag, task
from pendulum import datetime
import requests


# Define the basic parameters of the DAG, like schedule and start_date
@dag(
    start_date=datetime(2024, 1, 1),
    schedule="@daily",
    catchup=False,
    doc_md=__doc__,
    default_args={"owner": "Astro", "retries": 3},
    tags=["example"],
)
def example_astronauts():
    # Define tasks
    @task(
        # Define a dataset outlet for the task. This can be used to schedule downstream DAGs when this task has run.
        outlets=[Asset("current_astronauts")]
    )  # Define that this task updates the `current_astronauts` Dataset
    def get_astronauts(**context) -> list[dict]:
        """
        This task uses the requests library to retrieve a list of Astronauts
        currently in space. The results are pushed to XCom with a specific key
        so they can be used in a downstream pipeline. The task returns a list
        of Astronauts to be used in the next task.
        """
        try:
            r = requests.get("http://api.open-notify.org/astros.json")
            r.raise_for_status()
            number_of_people_in_space = r.json()["number"]
            list_of_people_in_space = r.json()["people"]
        except Exception:
            print("API currently not available, using hardcoded data instead.")
            number_of_people_in_space = 12
            list_of_people_in_space = [
                {"craft": "ISS", "name": "Oleg Kononenko"},
                {"craft": "ISS", "name": "Nikolai Chub"},
                {"craft": "ISS", "name": "Tracy Caldwell Dyson"},
                {"craft": "ISS", "name": "Matthew Dominick"},
                {"craft": "ISS", "name": "Michael Barratt"},
                {"craft": "ISS", "name": "Jeanette Epps"},
                {"craft": "ISS", "name": "Alexander Grebenkin"},
                {"craft": "ISS", "name": "Butch Wilmore"},
                {"craft": "ISS", "name": "Sunita Williams"},
                {"craft": "Tiangong", "name": "Li Guangsu"},
                {"craft": "Tiangong", "name": "Li Cong"},
                {"craft": "Tiangong", "name": "Ye Guangfu"},
            ]

        context["ti"].xcom_push(
            key="number_of_people_in_space", value=number_of_people_in_space
        )
        return list_of_people_in_space

    @task
    def print_astronaut_craft(greeting: str, person_in_space: dict) -> None:
        """
        This task creates a print statement with the name of an
        Astronaut in space and the craft they are flying on from
        the API request results of the previous task, along with a
        greeting which is hard-coded in this example.
        """
        craft = person_in_space["craft"]
        name = person_in_space["name"]

        print(f"{name} is currently in space flying on the {craft}! {greeting}")

    # Use dynamic task mapping to run the print_astronaut_craft task for each
    # Astronaut in space
    print_astronaut_craft.partial(greeting="Hello! :)").expand(
        person_in_space=get_astronauts()  # Define dependencies using TaskFlow API syntax
    )


# Instantiate the DAG
example_astronauts()
models/anomaly-detection/dags/train_anomaly_model.py
ADDED
@@ -0,0 +1,241 @@
"""
models/anomaly-detection/dags/train_anomaly_model.py
Apache Airflow DAG for scheduled anomaly detection model training
Uses Astronomer (Astro) for deployment
"""
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.sensors.python import PythonSensor
import os
import sys
import logging

# Add project to path
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PROJECT_ROOT)

# Load .env from root ModelX directory for MLflow credentials
try:
    from dotenv import load_dotenv
    root_env = os.path.join(PROJECT_ROOT, '..', '..', '.env')
    if os.path.exists(root_env):
        load_dotenv(root_env)
    else:
        load_dotenv()  # Try default locations
except ImportError:
    pass

logger = logging.getLogger(__name__)

# Configuration
BATCH_THRESHOLD = int(os.getenv("BATCH_THRESHOLD", "1000"))
SQLITE_DB_PATH = os.getenv("SQLITE_DB_PATH", "")

# Default DAG arguments
default_args = {
    'owner': 'modelx',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}


def check_new_records(**context) -> bool:
    """
    Sensor function to check if enough new records exist.
    Returns True if batch threshold is met or daily run is due.
    """
    import sqlite3
    from datetime import datetime, timedelta

    try:
        # Get last training timestamp from XCom or default to 24h ago
        last_training = context['ti'].xcom_pull(key='last_training_timestamp')
        if not last_training:
            last_training = (datetime.utcnow() - timedelta(hours=24)).isoformat()

        # Check SQLite for new records
        if SQLITE_DB_PATH and os.path.exists(SQLITE_DB_PATH):
            conn = sqlite3.connect(SQLITE_DB_PATH)
            cursor = conn.execute(
                'SELECT COUNT(*) FROM seen_hashes WHERE last_seen > ?',
                (last_training,)
            )
            new_records = cursor.fetchone()[0]
            conn.close()

            logger.info(f"[AnomalyDAG] New records since {last_training}: {new_records}")

            if new_records >= BATCH_THRESHOLD:
                logger.info(f"[AnomalyDAG] Batch threshold met ({new_records} >= {BATCH_THRESHOLD})")
                return True

        # Check if 24 hours have passed (daily fallback)
        if last_training:
            last_dt = datetime.fromisoformat(last_training)
            hours_since = (datetime.utcnow() - last_dt).total_seconds() / 3600
            if hours_since >= 24:
                logger.info(f"[AnomalyDAG] Daily run triggered ({hours_since:.1f}h since last run)")
                return True

        logger.info(f"[AnomalyDAG] Waiting for more records...")
        return False

    except Exception as e:
        logger.error(f"[AnomalyDAG] Error checking records: {e}")
        # Trigger anyway on error
        return True


def run_data_ingestion(**context):
    """Run data ingestion step"""
    from src.components import DataIngestion
    from src.entity import DataIngestionConfig

    config = DataIngestionConfig()
    ingestion = DataIngestion(config)
    artifact = ingestion.ingest()

    # Store artifact path in XCom
    context['ti'].xcom_push(key='ingestion_artifact', value={
        'raw_data_path': artifact.raw_data_path,
        'total_records': artifact.total_records,
        'is_data_available': artifact.is_data_available
    })

    if not artifact.is_data_available:
        raise ValueError("No data available for training")

    return artifact.raw_data_path


def run_data_validation(**context):
    """Run data validation step"""
    from src.components import DataValidation
    from src.entity import DataValidationConfig

    # Get ingestion output from XCom
    ingestion = context['ti'].xcom_pull(key='ingestion_artifact', task_ids='data_ingestion')
    raw_data_path = ingestion['raw_data_path']

    config = DataValidationConfig()
    validation = DataValidation(config)
    artifact = validation.validate(raw_data_path)

    # Store artifact in XCom
    context['ti'].xcom_push(key='validation_artifact', value={
        'validated_data_path': artifact.validated_data_path,
        'validation_status': artifact.validation_status,
        'valid_records': artifact.valid_records
    })

    return artifact.validated_data_path


def run_data_transformation(**context):
    """Run data transformation step"""
    from src.components import DataTransformation
    from src.entity import DataTransformationConfig

    # Get validation output from XCom
    validation = context['ti'].xcom_pull(key='validation_artifact', task_ids='data_validation')
    validated_data_path = validation['validated_data_path']

    config = DataTransformationConfig()
    transformation = DataTransformation(config)
    artifact = transformation.transform(validated_data_path)

    # Store artifact in XCom
    context['ti'].xcom_push(key='transformation_artifact', value={
        'feature_store_path': artifact.feature_store_path,
        'language_distribution': artifact.language_distribution,
        'total_records': artifact.total_records
    })

    return artifact.feature_store_path


def run_model_training(**context):
    """Run model training with Optuna and MLflow"""
    from src.components import ModelTrainer
    from src.entity import ModelTrainerConfig
    from datetime import datetime

    # Get transformation output from XCom
    transformation = context['ti'].xcom_pull(key='transformation_artifact', task_ids='data_transformation')
    feature_path = transformation['feature_store_path']

    config = ModelTrainerConfig()
    trainer = ModelTrainer(config)
    artifact = trainer.train(feature_path)

    # Store training timestamp for next run
    context['ti'].xcom_push(key='last_training_timestamp', value=datetime.utcnow().isoformat())

    # Store artifact in XCom
    context['ti'].xcom_push(key='training_artifact', value={
        'best_model_name': artifact.best_model_name,
        'best_model_path': artifact.best_model_path,
        'mlflow_run_id': artifact.mlflow_run_id,
        'n_anomalies': artifact.n_anomalies
    })

    return artifact.best_model_path


# Create DAG
with DAG(
    'anomaly_detection_training',
    default_args=default_args,
    description='Train anomaly detection models on feed data',
    schedule_interval=timedelta(hours=4),  # Check every 4 hours
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=['ml', 'anomaly', 'modelx'],
) as dag:

    # Start
    start = EmptyOperator(task_id='start')

    # Sensor: Check for new records
    check_records = PythonSensor(
        task_id='check_new_records',
        python_callable=check_new_records,
        timeout=3600,
        poke_interval=300,  # Check every 5 minutes
        mode='poke',
    )

    # Data Ingestion
    data_ingestion = PythonOperator(
        task_id='data_ingestion',
        python_callable=run_data_ingestion,
    )

    # Data Validation
    data_validation = PythonOperator(
        task_id='data_validation',
        python_callable=run_data_validation,
    )

    # Data Transformation
    data_transformation = PythonOperator(
        task_id='data_transformation',
        python_callable=run_data_transformation,
    )

    # Model Training
    model_training = PythonOperator(
        task_id='model_training',
        python_callable=run_model_training,
    )

    # End
    end = EmptyOperator(task_id='end')

    # Pipeline flow
    start >> check_records >> data_ingestion >> data_validation >> data_transformation >> model_training >> end
models/anomaly-detection/data_schema/schema.yaml
ADDED
@@ -0,0 +1,133 @@
# Data Schema for Anomaly Detection Pipeline
# Based on the feeds dataset created by the combined graph

feed_columns:
  # Required columns
  post_id:
    dtype: str
    required: true
    description: "Unique identifier for the post"

  timestamp:
    dtype: str  # ISO format or Unix timestamp
    required: true
    description: "Post timestamp"

  platform:
    dtype: str
    required: true
    allowed_values: ["reddit", "facebook", "twitter", "linkedin", "instagram", "web"]
    description: "Source platform"

  category:
    dtype: str
    required: true
    description: "Post category (competitor_mention, profile_monitoring, etc.)"

  text:
    dtype: str
    required: true
    min_length: 10
    max_length: 5000
    description: "Main text content"

  content_hash:
    dtype: str
    required: true
    description: "MD5/SHA256 hash for deduplication"

  # Optional columns
  entity:
    dtype: str
    required: false
    description: "Referenced entity (Dialog, SLT, etc.)"

  poster:
    dtype: str
    required: false
    description: "Author/poster username"

  post_url:
    dtype: str
    required: false
    description: "URL to original post"

  title:
    dtype: str
    required: false
    description: "Post title if available"

  engagement_score:
    dtype: float
    required: false
    min_value: 0
    description: "Overall engagement score"

  engagement_likes:
    dtype: int
    required: false
    min_value: 0
    description: "Number of likes"

  engagement_shares:
    dtype: int
    required: false
    min_value: 0
    description: "Number of shares"

  engagement_comments:
    dtype: int
    required: false
    min_value: 0
    description: "Number of comments"

  source_tool:
    dtype: str
    required: false
    description: "Tool used for scraping (scrape_reddit, scrape_facebook_profile, etc.)"

# SQLite schema for feed cache
sqlite_schema:
  table: seen_hashes
  columns:
    - content_hash: TEXT PRIMARY KEY
    - first_seen: TIMESTAMP
    - last_seen: TIMESTAMP
    - event_id: TEXT
    - summary_preview: TEXT

# Feature engineering configuration
features:
  temporal:
    - hour_of_day
    - day_of_week
    - is_weekend
    - is_business_hours

  engagement:
    - normalized_score
    - log_engagement
    - engagement_ratio

  text:
    - language  # en, si, ta
    - vector_embedding  # 768-dim for BERT models
    - text_length
    - word_count

# Language detection configuration
languages:
  supported:
    - code: en
      name: English
      model: distilbert-base-uncased
    - code: si
      name: Sinhala
      model: keshan/SinhalaBERTo
    - code: ta
      name: Tamil
      model: l3cube-pune/tamil-bert

  detection:
    method: fasttext  # or lingua-py
    min_confidence: 0.8
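For illustration only, a sketch of how the required/optional flags in feed_columns could drive a simple column check on an ingested DataFrame; the helper name is hypothetical and is not the pipeline's actual DataValidation component:

import pandas as pd
import yaml

def missing_required_columns(df: pd.DataFrame, schema_path: str) -> list:
    """Return required feed_columns entries that are absent from the DataFrame."""
    with open(schema_path) as f:
        schema = yaml.safe_load(f)
    required = [
        name for name, spec in schema["feed_columns"].items()
        if spec.get("required", False)
    ]
    return [col for col in required if col not in df.columns]

# Example: an empty DataFrame is missing every required column.
print(missing_required_columns(pd.DataFrame(), "data_schema/schema.yaml"))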
models/anomaly-detection/download_models.py
ADDED
@@ -0,0 +1,86 @@
"""
models/anomaly-detection/download_models.py
Script to pre-download all required models for the pipeline.
"""
import os
import sys
import requests
import logging
from pathlib import Path
from tqdm import tqdm

# Add src to path
sys.path.insert(0, str(Path(__file__).parent / "src"))

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger("downloader")

# Constants
CACHE_DIR = Path(__file__).parent / "models_cache"
FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
FASTTEXT_PATH = CACHE_DIR / "lid.176.bin"

def download_file(url, destination):
    """Download file with progress bar"""
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))

    with open(destination, 'wb') as file, tqdm(
        desc=destination.name,
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in response.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)

def main():
    logger.info("=" * 50)
    logger.info("⬇️ MODEL DOWNLOADER")
    logger.info("=" * 50)

    # Ensure cache directory exists
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    logger.info(f"📂 Cache Directory: {CACHE_DIR}")

    # 1. Download FastText Model
    logger.info("\n[1/2] Checking FastText Model (Language Detection)...")
    if not FASTTEXT_PATH.exists():
        logger.info(f"   Downloading lid.176.bin...")
        try:
            download_file(FASTTEXT_URL, FASTTEXT_PATH)
            logger.info("   ✅ Download complete")
        except Exception as e:
            logger.error(f"   ❌ Failed to download FastText: {e}")
    else:
        logger.info("   ✅ FastText model already exists")

    # 2. Download HuggingFace Models
    logger.info("\n[2/2] Checking HuggingFace BERT Models (Vectorization)...")
    try:
        from src.utils.vectorizer import get_vectorizer

        # Initialize vectorizer which handles HF downloads
        logger.info("   Initializing vectorizer to trigger downloads...")
        vectorizer = get_vectorizer(models_cache_dir=str(CACHE_DIR))

        # Trigger downloads for all languages
        vectorizer.download_all_models()

        logger.info("   ✅ All BERT models ready")

    except ImportError:
        logger.error("   ❌ Could not import vectorizer. Install requirements first:")
        logger.error("      pip install -r requirements.txt")
    except Exception as e:
        logger.error(f"   ❌ Error downloading BERT models: {e}")

    logger.info("\n" + "=" * 50)
    logger.info("✨ SETUP COMPLETE")
    logger.info("=" * 50)

if __name__ == "__main__":
    main()
models/anomaly-detection/main.py
ADDED
@@ -0,0 +1,85 @@
"""
models/anomaly-detection/main.py
Entry point for the anomaly detection training pipeline
"""
import os
import sys
import logging
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent / "src"))

from src.pipeline import run_training_pipeline
from src.entity import PipelineConfig

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("training.log")
    ]
)

logger = logging.getLogger("main")


def main():
    """Run the anomaly detection training pipeline"""
    logger.info("=" * 60)
    logger.info("ANOMALY DETECTION PIPELINE")
    logger.info("=" * 60)

    # Load environment variables
    from dotenv import load_dotenv
    load_dotenv()

    # Create configuration
    config = PipelineConfig()

    # Run pipeline
    try:
        artifact = run_training_pipeline(config)

        logger.info("\n" + "=" * 60)
        logger.info("PIPELINE RESULTS")
        logger.info("=" * 60)
        logger.info(f"Status: {artifact.pipeline_status}")
        logger.info(f"Run ID: {artifact.pipeline_run_id}")
        logger.info(f"Duration: {artifact.pipeline_start_time} to {artifact.pipeline_end_time}")

        logger.info("\n--- Data Ingestion ---")
        logger.info(f"Total records: {artifact.data_ingestion.total_records}")
        logger.info(f"From SQLite: {artifact.data_ingestion.records_from_sqlite}")
        logger.info(f"From CSV: {artifact.data_ingestion.records_from_csv}")

        logger.info("\n--- Data Validation ---")
        logger.info(f"Valid records: {artifact.data_validation.valid_records}")
        logger.info(f"Validation status: {artifact.data_validation.validation_status}")

        logger.info("\n--- Data Transformation ---")
        logger.info(f"Language distribution: {artifact.data_transformation.language_distribution}")

        logger.info("\n--- Model Training ---")
        logger.info(f"Best model: {artifact.model_trainer.best_model_name}")
        logger.info(f"Best metrics: {artifact.model_trainer.best_model_metrics}")
        logger.info(f"MLflow run: {artifact.model_trainer.mlflow_run_id}")

        if artifact.model_trainer.n_anomalies:
            logger.info(f"Anomalies detected: {artifact.model_trainer.n_anomalies}")

        logger.info("\n" + "=" * 60)
        logger.info("PIPELINE COMPLETE")
        logger.info("=" * 60)

        return artifact

    except Exception as e:
        logger.error(f"Pipeline failed: {e}")
        raise


if __name__ == "__main__":
    main()
models/anomaly-detection/packages.txt
ADDED
File without changes
models/anomaly-detection/requirements.txt
ADDED
@@ -0,0 +1,37 @@
# Anomaly Detection Pipeline - Requirements

# ML & Clustering
optuna>=3.0
scikit-learn>=1.0
hdbscan>=0.8.29
umap-learn>=0.5

# Language Detection
fasttext-wheel>=0.9.2
lingua-language-detector>=2.0

# NLP & Transformers
transformers>=4.30
sentence-transformers>=2.0
torch>=2.0

# MLflow & Tracking
mlflow>=2.0
dagshub>=0.3

# Database
pymongo>=4.0

# Data Processing
pandas>=2.0
numpy>=1.24
pyyaml>=6.0

# Airflow (managed by Astro)
# apache-airflow>=2.7

# Utilities
joblib>=1.3
tqdm>=4.65

astro-run-dag # This package is needed for the astro run command. It will be removed before a deploy
models/anomaly-detection/src/__init__.py
ADDED
@@ -0,0 +1,18 @@
"""
models/anomaly-detection/src/__init__.py
Anomaly Detection Pipeline Package
"""

from .components.data_ingestion import DataIngestion
from .components.data_validation import DataValidation
from .components.data_transformation import DataTransformation
from .components.model_trainer import ModelTrainer

__all__ = [
    "DataIngestion",
    "DataValidation",
    "DataTransformation",
    "ModelTrainer"
]

__version__ = "1.0.0"
models/anomaly-detection/src/components/__init__.py
ADDED
@@ -0,0 +1,29 @@
"""
models/anomaly-detection/src/components/__init__.py

Sets up paths for integration with main project before importing components.
"""
import sys
from pathlib import Path

# Add main project root to path for vectorization agent graph access
# Path: models/anomaly-detection/src/components/__init__.py -> go up 4 levels to ModelX-Ultimate
# Note: This is secondary to anomaly-detection path. Direct graph import won't work
# due to 'src' namespace collision. Use VectorizationAPI HTTP calls instead.
_main_project_root = Path(__file__).parent.parent.parent.parent.parent
_main_path = str(_main_project_root)
if _main_path not in sys.path:
    sys.path.append(_main_path)  # Append, don't insert at 0

from .data_ingestion import DataIngestion
from .data_validation import DataValidation
from .data_transformation import DataTransformation
from .model_trainer import ModelTrainer

__all__ = [
    "DataIngestion",
    "DataValidation",
    "DataTransformation",
    "ModelTrainer"
]
models/anomaly-detection/src/components/data_ingestion.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
models/anomaly-detection/src/components/data_ingestion.py
|
| 3 |
+
Data ingestion from SQLite feed cache and CSV files
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sqlite3
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
from ..entity import DataIngestionConfig, DataIngestionArtifact
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger("data_ingestion")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DataIngestion:
|
| 19 |
+
"""
|
| 20 |
+
Data ingestion component that fetches feed data from:
|
| 21 |
+
1. SQLite database (feed_cache.db) - production deduped feeds
|
| 22 |
+
2. CSV files in datasets/political_feeds/ - historical data
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
def __init__(self, config: Optional[DataIngestionConfig] = None):
|
| 26 |
+
"""
|
| 27 |
+
Initialize data ingestion component.
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
config: Optional configuration, uses defaults if None
|
| 31 |
+
"""
|
| 32 |
+
self.config = config or DataIngestionConfig()
|
| 33 |
+
|
| 34 |
+
# Ensure output directory exists
|
| 35 |
+
Path(self.config.output_directory).mkdir(parents=True, exist_ok=True)
|
| 36 |
+
|
| 37 |
+
logger.info(f"[DataIngestion] Initialized")
|
| 38 |
+
logger.info(f" SQLite: {self.config.sqlite_db_path}")
|
| 39 |
+
logger.info(f" CSV Dir: {self.config.csv_directory}")
|
| 40 |
+
logger.info(f" Output: {self.config.output_directory}")
|
| 41 |
+
|
| 42 |
+
def _fetch_from_sqlite(self) -> pd.DataFrame:
|
| 43 |
+
"""
|
| 44 |
+
Fetch feed data from SQLite cache database.
|
| 45 |
+
|
| 46 |
+
Returns:
|
| 47 |
+
DataFrame with feed records
|
| 48 |
+
"""
|
| 49 |
+
db_path = self.config.sqlite_db_path
|
| 50 |
+
|
| 51 |
+
if not os.path.exists(db_path):
|
| 52 |
+
logger.warning(f"[DataIngestion] SQLite DB not found: {db_path}")
|
| 53 |
+
return pd.DataFrame()
|
| 54 |
+
|
| 55 |
+
try:
|
| 56 |
+
conn = sqlite3.connect(db_path)
|
| 57 |
+
|
| 58 |
+
# Query the seen_hashes table
|
| 59 |
+
query = """
|
| 60 |
+
SELECT
|
| 61 |
+
content_hash as post_id,
|
| 62 |
+
first_seen as timestamp,
|
| 63 |
+
event_id,
|
| 64 |
+
summary_preview as text
|
| 65 |
+
FROM seen_hashes
|
| 66 |
+
ORDER BY last_seen DESC
|
| 67 |
+
"""
|
| 68 |
+
df = pd.read_sql_query(query, conn)
|
| 69 |
+
conn.close()
|
| 70 |
+
|
| 71 |
+
# Add default columns for compatibility
|
| 72 |
+
if not df.empty:
|
| 73 |
+
df["platform"] = "mixed"
|
| 74 |
+
df["category"] = "feed"
|
| 75 |
+
df["content_hash"] = df["post_id"]
|
| 76 |
+
df["source"] = "sqlite"
|
| 77 |
+
|
| 78 |
+
logger.info(f"[DataIngestion] Fetched {len(df)} records from SQLite")
|
| 79 |
+
return df
|
| 80 |
+
|
| 81 |
+
except Exception as e:
|
| 82 |
+
logger.error(f"[DataIngestion] SQLite error: {e}")
|
| 83 |
+
return pd.DataFrame()
|
| 84 |
+
|
| 85 |
+
def _fetch_from_csv(self) -> pd.DataFrame:
|
| 86 |
+
"""
|
| 87 |
+
Fetch feed data from CSV files in datasets directory.
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
Combined DataFrame from all CSV files
|
| 91 |
+
"""
|
| 92 |
+
csv_dir = Path(self.config.csv_directory)
|
| 93 |
+
|
| 94 |
+
if not csv_dir.exists():
|
| 95 |
+
logger.warning(f"[DataIngestion] CSV directory not found: {csv_dir}")
|
| 96 |
+
return pd.DataFrame()
|
| 97 |
+
|
| 98 |
+
all_dfs = []
|
| 99 |
+
csv_files = list(csv_dir.glob("*.csv"))
|
| 100 |
+
|
| 101 |
+
for csv_file in csv_files:
|
| 102 |
+
try:
|
| 103 |
+
df = pd.read_csv(csv_file)
|
| 104 |
+
df["source_file"] = csv_file.name
|
| 105 |
+
df["source"] = "csv"
|
| 106 |
+
all_dfs.append(df)
|
| 107 |
+
logger.info(f"[DataIngestion] Loaded {len(df)} records from {csv_file.name}")
|
| 108 |
+
except Exception as e:
|
| 109 |
+
logger.warning(f"[DataIngestion] Failed to load {csv_file}: {e}")
|
| 110 |
+
|
| 111 |
+
if not all_dfs:
|
| 112 |
+
return pd.DataFrame()
|
| 113 |
+
|
| 114 |
+
combined = pd.concat(all_dfs, ignore_index=True)
|
| 115 |
+
logger.info(f"[DataIngestion] Total {len(combined)} records from {len(csv_files)} CSV files")
|
| 116 |
+
return combined
|
| 117 |
+
|
| 118 |
+
def _deduplicate(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 119 |
+
"""
|
| 120 |
+
Remove duplicate records based on content_hash.
|
| 121 |
+
|
| 122 |
+
Args:
|
| 123 |
+
            df: Input DataFrame

        Returns:
            Deduplicated DataFrame
        """
        if df.empty:
            return df

        initial_count = len(df)

        # Use content_hash for deduplication, fallback to post_id
        if "content_hash" in df.columns:
            df = df.drop_duplicates(subset=["content_hash"], keep="first")
        elif "post_id" in df.columns:
            df = df.drop_duplicates(subset=["post_id"], keep="first")

        deduped_count = len(df)
        removed = initial_count - deduped_count

        if removed > 0:
            logger.info(f"[DataIngestion] Deduplicated: removed {removed} duplicates")

        return df

    def _filter_valid_records(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Filter records with sufficient text content.

        Args:
            df: Input DataFrame

        Returns:
            Filtered DataFrame
        """
        if df.empty:
            return df

        initial_count = len(df)

        # Ensure text column exists
        if "text" not in df.columns:
            # Try alternative column names
            text_cols = ["summary_preview", "title", "content"]
            for col in text_cols:
                if col in df.columns:
                    df["text"] = df[col]
                    break

        if "text" not in df.columns:
            logger.warning("[DataIngestion] No text column found")
            df["text"] = ""

        # Filter by minimum text length
        df = df[df["text"].str.len() >= self.config.min_text_length]

        filtered_count = len(df)
        removed = initial_count - filtered_count

        if removed > 0:
            logger.info(f"[DataIngestion] Filtered: removed {removed} short texts")

        return df

    def ingest(self) -> DataIngestionArtifact:
        """
        Execute data ingestion pipeline.

        Returns:
            DataIngestionArtifact with paths and statistics
        """
        logger.info("[DataIngestion] Starting data ingestion...")

        # Fetch from both sources
        sqlite_df = self._fetch_from_sqlite()
        csv_df = self._fetch_from_csv()

        records_from_sqlite = len(sqlite_df)
        records_from_csv = len(csv_df)

        # Combine sources
        if not sqlite_df.empty and not csv_df.empty:
            # Ensure compatible columns
            common_cols = list(set(sqlite_df.columns) & set(csv_df.columns))
            combined_df = pd.concat([
                sqlite_df[common_cols],
                csv_df[common_cols]
            ], ignore_index=True)
        elif not sqlite_df.empty:
            combined_df = sqlite_df
        elif not csv_df.empty:
            combined_df = csv_df
        else:
            combined_df = pd.DataFrame()

        # Deduplicate
        combined_df = self._deduplicate(combined_df)

        # Filter valid records
        combined_df = self._filter_valid_records(combined_df)

        total_records = len(combined_df)
        is_data_available = total_records > 0

        # Save to output
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = Path(self.config.output_directory) / f"ingested_data_{timestamp}.parquet"

        if is_data_available:
            combined_df.to_parquet(output_path, index=False)
            logger.info(f"[DataIngestion] Saved {total_records} records to {output_path}")
        else:
            output_path = str(output_path)
            logger.warning("[DataIngestion] No data available to save")

        artifact = DataIngestionArtifact(
            raw_data_path=str(output_path),
            total_records=total_records,
            records_from_sqlite=records_from_sqlite,
            records_from_csv=records_from_csv,
            ingestion_timestamp=timestamp,
            is_data_available=is_data_available
        )

        logger.info(f"[DataIngestion] ✓ Complete: {total_records} records")
        return artifact
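Note: a minimal sketch of driving this ingestion component standalone, assuming the class in data_ingestion.py is named DataIngestion and that its default DataIngestionConfig points at the project's SQLite/CSV sources (neither is confirmed by this part of the diff):

# Illustrative only; class name and import path are assumptions.
from src.components.data_ingestion import DataIngestion

ingestion = DataIngestion()        # hypothetical default config
artifact = ingestion.ingest()      # fetch -> combine -> dedupe -> filter -> save

if artifact.is_data_available:
    print(f"{artifact.total_records} records written to {artifact.raw_data_path}")
else:
    print("No usable records ingested")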
models/anomaly-detection/src/components/data_transformation.py
ADDED
@@ -0,0 +1,458 @@
"""
|
| 2 |
+
models/anomaly-detection/src/components/data_transformation.py
|
| 3 |
+
Data transformation with language detection and text vectorization
|
| 4 |
+
Integrates with Vectorization Agent Graph for LLM-enhanced processing
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import logging
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Optional, Dict, Any, List
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
|
| 15 |
+
from ..entity import DataTransformationConfig, DataTransformationArtifact
|
| 16 |
+
from ..utils import detect_language, get_vectorizer
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger("data_transformation")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class DataTransformation:
|
| 22 |
+
"""
|
| 23 |
+
Data transformation component that:
|
| 24 |
+
1. Detects language (Sinhala/Tamil/English)
|
| 25 |
+
2. Extracts text embeddings using language-specific BERT models
|
| 26 |
+
3. Engineers temporal and engagement features
|
| 27 |
+
4. Optionally integrates with Vectorizer Agent Graph for LLM insights
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
def __init__(self, config: Optional[DataTransformationConfig] = None, use_agent_graph: bool = True):
|
| 31 |
+
"""
|
| 32 |
+
Initialize data transformation component.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
config: Optional configuration, uses defaults if None
|
| 36 |
+
use_agent_graph: If True, use vectorizer agent graph for processing
|
| 37 |
+
"""
|
| 38 |
+
self.config = config or DataTransformationConfig()
|
| 39 |
+
self.use_agent_graph = use_agent_graph
|
| 40 |
+
|
| 41 |
+
# Ensure output directory exists
|
| 42 |
+
Path(self.config.output_directory).mkdir(parents=True, exist_ok=True)
|
| 43 |
+
|
| 44 |
+
# Get vectorizer (lazy loaded)
|
| 45 |
+
self.vectorizer = get_vectorizer(self.config.models_cache_dir)
|
| 46 |
+
|
| 47 |
+
# Vectorization API integration
|
| 48 |
+
# Note: Direct import of vectorizationAgentGraph fails due to 'src' namespace collision
|
| 49 |
+
# between this project (models/anomaly-detection/src) and main project (src).
|
| 50 |
+
# Instead, we call the Vectorization API via HTTP when available.
|
| 51 |
+
self.vectorizer_graph = None # Not used - we use HTTP API instead
|
| 52 |
+
self.vectorization_api_url = os.getenv("VECTORIZATION_API_URL", "http://localhost:8001")
|
| 53 |
+
self.vectorization_api_available = False
|
| 54 |
+
|
| 55 |
+
if self.use_agent_graph:
|
| 56 |
+
# Check if vectorization API is available
|
| 57 |
+
try:
|
| 58 |
+
import requests
|
| 59 |
+
response = requests.get(f"{self.vectorization_api_url}/health", timeout=10)
|
| 60 |
+
if response.status_code == 200:
|
| 61 |
+
self.vectorization_api_available = True
|
| 62 |
+
logger.info(f"[DataTransformation] [OK] Vectorization API available at {self.vectorization_api_url}")
|
| 63 |
+
else:
|
| 64 |
+
logger.warning(f"[DataTransformation] Vectorization API returned status {response.status_code}")
|
| 65 |
+
except Exception as e:
|
| 66 |
+
logger.warning(f"[DataTransformation] Vectorization API not available: {e}")
|
| 67 |
+
logger.info("[DataTransformation] Using local vectorization (no LLM insights)")
|
| 68 |
+
|
| 69 |
+
logger.info(f"[DataTransformation] Initialized")
|
| 70 |
+
logger.info(f" Models cache: {self.config.models_cache_dir}")
|
| 71 |
+
logger.info(f" Vectorization API: {'enabled' if self.vectorization_api_available else 'disabled (using local)'}")
|
| 72 |
+
|
| 73 |
+
def _process_with_agent_graph(self, texts: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 74 |
+
"""
|
| 75 |
+
Process texts through the Vectorization API.
|
| 76 |
+
|
| 77 |
+
Uses HTTP calls to the vectorization API server which runs the
|
| 78 |
+
Vectorizer Agent Graph. This avoids the 'src' namespace collision.
|
| 79 |
+
|
| 80 |
+
This provides:
|
| 81 |
+
- Language detection
|
| 82 |
+
- Vector embeddings
|
| 83 |
+
- LLM expert summary
|
| 84 |
+
- Opportunity/threat analysis
|
| 85 |
+
|
| 86 |
+
Args:
|
| 87 |
+
texts: List of {text, post_id, metadata} dicts
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
Dict with language_detection_results, vector_embeddings, expert_summary, etc.
|
| 91 |
+
"""
|
| 92 |
+
if not self.vectorization_api_available:
|
| 93 |
+
logger.warning("[DataTransformation] Vectorization API not available, using fallback")
|
| 94 |
+
return None
|
| 95 |
+
|
| 96 |
+
try:
|
| 97 |
+
import requests
|
| 98 |
+
|
| 99 |
+
batch_id = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 100 |
+
|
| 101 |
+
# Prepare request payload
|
| 102 |
+
payload = {
|
| 103 |
+
"texts": [
|
| 104 |
+
{
|
| 105 |
+
"text": item.get("text", ""),
|
| 106 |
+
"post_id": item.get("post_id", f"text_{i}"),
|
| 107 |
+
"metadata": item.get("metadata", {})
|
| 108 |
+
}
|
| 109 |
+
for i, item in enumerate(texts)
|
| 110 |
+
],
|
| 111 |
+
"batch_id": batch_id,
|
| 112 |
+
"include_vectors": True,
|
| 113 |
+
"include_expert_summary": True
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
# Call vectorization API
|
| 117 |
+
response = requests.post(
|
| 118 |
+
f"{self.vectorization_api_url}/vectorize",
|
| 119 |
+
json=payload,
|
| 120 |
+
timeout=120 # 2 minutes for large batches
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
if response.status_code == 200:
|
| 124 |
+
result = response.json()
|
| 125 |
+
logger.info(f"[DataTransformation] Vectorization API processed {len(texts)} texts")
|
| 126 |
+
|
| 127 |
+
# Convert API response to expected format
|
| 128 |
+
return {
|
| 129 |
+
"language_detection_results": result.get("vectors", []),
|
| 130 |
+
"vector_embeddings": result.get("vectors", []),
|
| 131 |
+
"expert_summary": result.get("expert_summary", ""),
|
| 132 |
+
"opportunities": [], # Extracted from domain_insights
|
| 133 |
+
"threats": [], # Extracted from domain_insights
|
| 134 |
+
"domain_insights": result.get("domain_insights", []),
|
| 135 |
+
"processing_stats": {
|
| 136 |
+
"language_distribution": result.get("language_distribution", {}),
|
| 137 |
+
"processing_time": result.get("processing_time_seconds", 0)
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
else:
|
| 141 |
+
logger.error(f"[DataTransformation] Vectorization API error: {response.status_code}")
|
| 142 |
+
return None
|
| 143 |
+
|
| 144 |
+
except Exception as e:
|
| 145 |
+
logger.error(f"[DataTransformation] Vectorization API call failed: {e}")
|
| 146 |
+
return None
|
| 147 |
+
|
| 148 |
+
def _detect_languages(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 149 |
+
"""
|
| 150 |
+
Detect language for each text entry.
|
| 151 |
+
|
| 152 |
+
Args:
|
| 153 |
+
df: Input DataFrame with 'text' column
|
| 154 |
+
|
| 155 |
+
Returns:
|
| 156 |
+
DataFrame with 'language' and 'language_confidence' columns
|
| 157 |
+
"""
|
| 158 |
+
logger.info("[DataTransformation] Detecting languages...")
|
| 159 |
+
|
| 160 |
+
languages = []
|
| 161 |
+
confidences = []
|
| 162 |
+
|
| 163 |
+
for text in tqdm(df["text"].fillna(""), desc="Language Detection"):
|
| 164 |
+
lang, conf = detect_language(text)
|
| 165 |
+
languages.append(lang)
|
| 166 |
+
confidences.append(conf)
|
| 167 |
+
|
| 168 |
+
df["language"] = languages
|
| 169 |
+
df["language_confidence"] = confidences
|
| 170 |
+
|
| 171 |
+
# Log distribution
|
| 172 |
+
lang_counts = df["language"].value_counts()
|
| 173 |
+
logger.info(f"[DataTransformation] Language distribution:")
|
| 174 |
+
for lang, count in lang_counts.items():
|
| 175 |
+
logger.info(f" {lang}: {count} ({100*count/len(df):.1f}%)")
|
| 176 |
+
|
| 177 |
+
return df
|
| 178 |
+
|
| 179 |
+
def _extract_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 180 |
+
"""
|
| 181 |
+
Extract temporal features from timestamp.
|
| 182 |
+
|
| 183 |
+
Args:
|
| 184 |
+
df: Input DataFrame with 'timestamp' column
|
| 185 |
+
|
| 186 |
+
Returns:
|
| 187 |
+
DataFrame with temporal feature columns
|
| 188 |
+
"""
|
| 189 |
+
logger.info("[DataTransformation] Extracting temporal features...")
|
| 190 |
+
|
| 191 |
+
if "timestamp" not in df.columns:
|
| 192 |
+
logger.warning("[DataTransformation] No timestamp column found")
|
| 193 |
+
return df
|
| 194 |
+
|
| 195 |
+
# Convert to datetime
|
| 196 |
+
try:
|
| 197 |
+
df["datetime"] = pd.to_datetime(df["timestamp"], errors='coerce')
|
| 198 |
+
except Exception as e:
|
| 199 |
+
logger.warning(f"[DataTransformation] Timestamp conversion error: {e}")
|
| 200 |
+
return df
|
| 201 |
+
|
| 202 |
+
# Extract features
|
| 203 |
+
df["hour_of_day"] = df["datetime"].dt.hour.fillna(0).astype(int)
|
| 204 |
+
df["day_of_week"] = df["datetime"].dt.dayofweek.fillna(0).astype(int)
|
| 205 |
+
df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)
|
| 206 |
+
df["is_business_hours"] = ((df["hour_of_day"] >= 9) & (df["hour_of_day"] <= 17)).astype(int)
|
| 207 |
+
|
| 208 |
+
# Drop intermediate column
|
| 209 |
+
df = df.drop(columns=["datetime"], errors='ignore')
|
| 210 |
+
|
| 211 |
+
return df
|
| 212 |
+
|
| 213 |
+
def _extract_engagement_features(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 214 |
+
"""
|
| 215 |
+
Extract and normalize engagement features.
|
| 216 |
+
|
| 217 |
+
Args:
|
| 218 |
+
df: Input DataFrame
|
| 219 |
+
|
| 220 |
+
Returns:
|
| 221 |
+
DataFrame with engagement feature columns
|
| 222 |
+
"""
|
| 223 |
+
logger.info("[DataTransformation] Extracting engagement features...")
|
| 224 |
+
|
| 225 |
+
# Check for engagement columns
|
| 226 |
+
engagement_cols = ["engagement_score", "engagement_likes", "engagement_shares", "engagement_comments"]
|
| 227 |
+
|
| 228 |
+
for col in engagement_cols:
|
| 229 |
+
if col not in df.columns:
|
| 230 |
+
df[col] = 0
|
| 231 |
+
|
| 232 |
+
# Combined engagement score
|
| 233 |
+
df["total_engagement"] = (
|
| 234 |
+
df["engagement_likes"].fillna(0) +
|
| 235 |
+
df["engagement_shares"].fillna(0) * 2 + # Shares weighted more
|
| 236 |
+
df["engagement_comments"].fillna(0)
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
# Log transform for better distribution
|
| 240 |
+
df["log_engagement"] = np.log1p(df["total_engagement"])
|
| 241 |
+
|
| 242 |
+
# Normalize to 0-1 range
|
| 243 |
+
max_engagement = df["total_engagement"].max()
|
| 244 |
+
if max_engagement > 0:
|
| 245 |
+
df["normalized_engagement"] = df["total_engagement"] / max_engagement
|
| 246 |
+
else:
|
| 247 |
+
df["normalized_engagement"] = 0
|
| 248 |
+
|
| 249 |
+
return df
|
| 250 |
+
|
| 251 |
+
def _extract_text_features(self, df: pd.DataFrame) -> pd.DataFrame:
|
| 252 |
+
"""
|
| 253 |
+
Extract basic text features.
|
| 254 |
+
|
| 255 |
+
Args:
|
| 256 |
+
df: Input DataFrame with 'text' column
|
| 257 |
+
|
| 258 |
+
Returns:
|
| 259 |
+
DataFrame with text feature columns
|
| 260 |
+
"""
|
| 261 |
+
logger.info("[DataTransformation] Extracting text features...")
|
| 262 |
+
|
| 263 |
+
df["text_length"] = df["text"].fillna("").str.len()
|
| 264 |
+
df["word_count"] = df["text"].fillna("").str.split().str.len().fillna(0).astype(int)
|
| 265 |
+
|
| 266 |
+
return df
|
| 267 |
+
|
| 268 |
+
def _vectorize_texts(self, df: pd.DataFrame) -> np.ndarray:
|
| 269 |
+
"""
|
| 270 |
+
Vectorize texts using language-specific BERT models.
|
| 271 |
+
|
| 272 |
+
Args:
|
| 273 |
+
df: Input DataFrame with 'text' and 'language' columns
|
| 274 |
+
|
| 275 |
+
Returns:
|
| 276 |
+
numpy array of shape (n_samples, 768)
|
| 277 |
+
"""
|
| 278 |
+
logger.info("[DataTransformation] Vectorizing texts with BERT models...")
|
| 279 |
+
|
| 280 |
+
embeddings = []
|
| 281 |
+
|
| 282 |
+
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Text Vectorization"):
|
| 283 |
+
text = row.get("text", "")
|
| 284 |
+
language = row.get("language", "english")
|
| 285 |
+
|
| 286 |
+
try:
|
| 287 |
+
embedding = self.vectorizer.vectorize(text, language)
|
| 288 |
+
embeddings.append(embedding)
|
| 289 |
+
except Exception as e:
|
| 290 |
+
logger.debug(f"Vectorization error at {idx}: {e}")
|
| 291 |
+
embeddings.append(np.zeros(self.config.vector_dim))
|
| 292 |
+
|
| 293 |
+
return np.array(embeddings)
|
| 294 |
+
|
| 295 |
+
def _build_feature_matrix(self, df: pd.DataFrame, embeddings: np.ndarray) -> np.ndarray:
|
| 296 |
+
"""
|
| 297 |
+
Combine all features into a single feature matrix.
|
| 298 |
+
|
| 299 |
+
Args:
|
| 300 |
+
df: DataFrame with engineered features
|
| 301 |
+
embeddings: Text embeddings array
|
| 302 |
+
|
| 303 |
+
Returns:
|
| 304 |
+
Combined feature matrix
|
| 305 |
+
"""
|
| 306 |
+
logger.info("[DataTransformation] Building feature matrix...")
|
| 307 |
+
|
| 308 |
+
# Numeric features to include
|
| 309 |
+
numeric_cols = [
|
| 310 |
+
"hour_of_day", "day_of_week", "is_weekend", "is_business_hours",
|
| 311 |
+
"log_engagement", "normalized_engagement",
|
| 312 |
+
"text_length", "word_count"
|
| 313 |
+
]
|
| 314 |
+
|
| 315 |
+
# Filter to available columns
|
| 316 |
+
available_cols = [col for col in numeric_cols if col in df.columns]
|
| 317 |
+
|
| 318 |
+
if available_cols:
|
| 319 |
+
numeric_features = df[available_cols].fillna(0).values
|
| 320 |
+
# Normalize numeric features
|
| 321 |
+
from sklearn.preprocessing import StandardScaler
|
| 322 |
+
scaler = StandardScaler()
|
| 323 |
+
numeric_features = scaler.fit_transform(numeric_features)
|
| 324 |
+
else:
|
| 325 |
+
numeric_features = np.zeros((len(df), 1))
|
| 326 |
+
|
| 327 |
+
# Combine with embeddings
|
| 328 |
+
feature_matrix = np.hstack([embeddings, numeric_features])
|
| 329 |
+
|
| 330 |
+
logger.info(f"[DataTransformation] Feature matrix shape: {feature_matrix.shape}")
|
| 331 |
+
return feature_matrix
|
| 332 |
+
|
| 333 |
+
def transform(self, data_path: str) -> DataTransformationArtifact:
|
| 334 |
+
"""
|
| 335 |
+
Execute data transformation pipeline.
|
| 336 |
+
Integrates with Vectorizer Agent Graph for LLM-enhanced processing.
|
| 337 |
+
|
| 338 |
+
Args:
|
| 339 |
+
data_path: Path to validated data
|
| 340 |
+
|
| 341 |
+
Returns:
|
| 342 |
+
DataTransformationArtifact with paths and statistics
|
| 343 |
+
"""
|
| 344 |
+
import json
|
| 345 |
+
|
| 346 |
+
logger.info(f"[DataTransformation] Starting transformation: {data_path}")
|
| 347 |
+
|
| 348 |
+
# Load data
|
| 349 |
+
df = pd.read_parquet(data_path)
|
| 350 |
+
total_records = len(df)
|
| 351 |
+
logger.info(f"[DataTransformation] Loaded {total_records} records")
|
| 352 |
+
|
| 353 |
+
# Initialize agent graph results
|
| 354 |
+
agent_result = None
|
| 355 |
+
expert_summary = None
|
| 356 |
+
|
| 357 |
+
# Try to process with vectorizer agent graph first
|
| 358 |
+
if self.vectorizer_graph and self.use_agent_graph:
|
| 359 |
+
logger.info("[DataTransformation] Using Vectorizer Agent Graph...")
|
| 360 |
+
|
| 361 |
+
# Prepare texts for agent graph
|
| 362 |
+
texts_for_agent = []
|
| 363 |
+
for idx, row in df.iterrows():
|
| 364 |
+
texts_for_agent.append({
|
| 365 |
+
"post_id": str(row.get("id", idx)),
|
| 366 |
+
"text": str(row.get("text", "")),
|
| 367 |
+
"metadata": {
|
| 368 |
+
"source": row.get("source", "unknown"),
|
| 369 |
+
"timestamp": str(row.get("timestamp", ""))
|
| 370 |
+
}
|
| 371 |
+
})
|
| 372 |
+
|
| 373 |
+
# Process through agent graph
|
| 374 |
+
agent_result = self._process_with_agent_graph(texts_for_agent)
|
| 375 |
+
|
| 376 |
+
if agent_result:
|
| 377 |
+
expert_summary = agent_result.get("expert_summary", "")
|
| 378 |
+
logger.info(f"[DataTransformation] Agent graph completed with expert summary")
|
| 379 |
+
|
| 380 |
+
# Run standard transformations (fallback or additional)
|
| 381 |
+
df = self._detect_languages(df)
|
| 382 |
+
df = self._extract_temporal_features(df)
|
| 383 |
+
df = self._extract_engagement_features(df)
|
| 384 |
+
df = self._extract_text_features(df)
|
| 385 |
+
|
| 386 |
+
# Vectorize texts (use agent result if available, otherwise fallback)
|
| 387 |
+
if agent_result and agent_result.get("vector_embeddings"):
|
| 388 |
+
# Extract vectors from agent graph result
|
| 389 |
+
agent_embeddings = agent_result.get("vector_embeddings", [])
|
| 390 |
+
embeddings = np.array([
|
| 391 |
+
item.get("vector", [0.0] * 768) for item in agent_embeddings
|
| 392 |
+
])
|
| 393 |
+
logger.info(f"[DataTransformation] Using agent graph vectors: {embeddings.shape}")
|
| 394 |
+
else:
|
| 395 |
+
# Fallback to direct vectorization
|
| 396 |
+
embeddings = self._vectorize_texts(df)
|
| 397 |
+
|
| 398 |
+
# Build combined feature matrix
|
| 399 |
+
feature_matrix = self._build_feature_matrix(df, embeddings)
|
| 400 |
+
|
| 401 |
+
# Save outputs
|
| 402 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 403 |
+
|
| 404 |
+
# Save transformed dataframe
|
| 405 |
+
transformed_path = Path(self.config.output_directory) / f"transformed_data_{timestamp}.parquet"
|
| 406 |
+
df.to_parquet(transformed_path, index=False)
|
| 407 |
+
|
| 408 |
+
# Save embeddings
|
| 409 |
+
embeddings_path = Path(self.config.output_directory) / f"embeddings_{timestamp}.npy"
|
| 410 |
+
np.save(embeddings_path, embeddings)
|
| 411 |
+
|
| 412 |
+
# Save feature matrix
|
| 413 |
+
features_path = Path(self.config.output_directory) / f"features_{timestamp}.npy"
|
| 414 |
+
np.save(features_path, feature_matrix)
|
| 415 |
+
|
| 416 |
+
# Save agent graph insights if available
|
| 417 |
+
insights_path = None
|
| 418 |
+
if agent_result:
|
| 419 |
+
insights_path = Path(self.config.output_directory) / f"llm_insights_{timestamp}.json"
|
| 420 |
+
insights_data = {
|
| 421 |
+
"expert_summary": agent_result.get("expert_summary", ""),
|
| 422 |
+
"opportunities": agent_result.get("opportunities", []),
|
| 423 |
+
"threats": agent_result.get("threats", []),
|
| 424 |
+
"domain_insights": agent_result.get("domain_insights", []),
|
| 425 |
+
"processing_stats": agent_result.get("processing_stats", {})
|
| 426 |
+
}
|
| 427 |
+
with open(insights_path, "w", encoding="utf-8") as f:
|
| 428 |
+
json.dump(insights_data, f, indent=2, ensure_ascii=False)
|
| 429 |
+
logger.info(f"[DataTransformation] Saved LLM insights to {insights_path}")
|
| 430 |
+
|
| 431 |
+
# Language distribution
|
| 432 |
+
lang_dist = df["language"].value_counts().to_dict()
|
| 433 |
+
|
| 434 |
+
# Build report
|
| 435 |
+
report = {
|
| 436 |
+
"timestamp": timestamp,
|
| 437 |
+
"total_records": total_records,
|
| 438 |
+
"embedding_dim": embeddings.shape[1] if len(embeddings.shape) > 1 else 0,
|
| 439 |
+
"feature_dim": feature_matrix.shape[1],
|
| 440 |
+
"language_distribution": lang_dist,
|
| 441 |
+
"used_agent_graph": agent_result is not None,
|
| 442 |
+
"expert_summary_available": expert_summary is not None
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
artifact = DataTransformationArtifact(
|
| 446 |
+
transformed_data_path=str(transformed_path),
|
| 447 |
+
vector_embeddings_path=str(embeddings_path),
|
| 448 |
+
feature_store_path=str(features_path),
|
| 449 |
+
total_records=total_records,
|
| 450 |
+
language_distribution=lang_dist,
|
| 451 |
+
transformation_report=report
|
| 452 |
+
)
|
| 453 |
+
|
| 454 |
+
logger.info(f"[DataTransformation] ✓ Complete: {feature_matrix.shape}")
|
| 455 |
+
if agent_result:
|
| 456 |
+
logger.info(f"[DataTransformation] ✓ LLM Expert Summary: {len(expert_summary or '')} chars")
|
| 457 |
+
return artifact
|
| 458 |
+
|
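Note: the HTTP contract this component assumes from the Vectorization API, summarised as a sketch; the field names are taken from the calls above, while the example values are illustrative and the concrete API schema is not part of this diff.

# Request sent to POST {VECTORIZATION_API_URL}/vectorize (shape inferred from
# _process_with_agent_graph above; example values are hypothetical).
request_payload = {
    "texts": [
        {"text": "...", "post_id": "text_0", "metadata": {"source": "unknown", "timestamp": ""}},
    ],
    "batch_id": "20251208_152444",
    "include_vectors": True,
    "include_expert_summary": True,
}

# Response fields the component reads back from the JSON body:
expected_response_keys = [
    "vectors",                   # list of items, each carrying a "vector" of 768 floats
    "expert_summary",            # LLM-generated text
    "domain_insights",           # list of insight dicts
    "language_distribution",     # e.g. {"sinhala": 10, "tamil": 5, "english": 85}
    "processing_time_seconds",
]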
models/anomaly-detection/src/components/data_validation.py
ADDED
@@ -0,0 +1,261 @@
"""
|
| 2 |
+
models/anomaly-detection/src/components/data_validation.py
|
| 3 |
+
Data validation component based on schema.yaml
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import yaml
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional, List, Dict, Any
|
| 12 |
+
|
| 13 |
+
from ..entity import DataValidationConfig, DataValidationArtifact
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger("data_validation")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DataValidation:
|
| 19 |
+
"""
|
| 20 |
+
Data validation component that validates feed data against schema.
|
| 21 |
+
Checks column types, required fields, and value constraints.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self, config: Optional[DataValidationConfig] = None):
|
| 25 |
+
"""
|
| 26 |
+
Initialize data validation component.
|
| 27 |
+
|
| 28 |
+
Args:
|
| 29 |
+
config: Optional configuration, uses defaults if None
|
| 30 |
+
"""
|
| 31 |
+
self.config = config or DataValidationConfig()
|
| 32 |
+
|
| 33 |
+
# Ensure output directory exists
|
| 34 |
+
Path(self.config.output_directory).mkdir(parents=True, exist_ok=True)
|
| 35 |
+
|
| 36 |
+
# Load schema
|
| 37 |
+
self.schema = self._load_schema()
|
| 38 |
+
|
| 39 |
+
logger.info(f"[DataValidation] Initialized with schema: {self.config.schema_file}")
|
| 40 |
+
|
| 41 |
+
def _load_schema(self) -> Dict[str, Any]:
|
| 42 |
+
"""Load schema from YAML file"""
|
| 43 |
+
if not os.path.exists(self.config.schema_file):
|
| 44 |
+
logger.warning(f"[DataValidation] Schema file not found: {self.config.schema_file}")
|
| 45 |
+
return {}
|
| 46 |
+
|
| 47 |
+
try:
|
| 48 |
+
with open(self.config.schema_file, 'r', encoding='utf-8') as f:
|
| 49 |
+
return yaml.safe_load(f)
|
| 50 |
+
except Exception as e:
|
| 51 |
+
logger.error(f"[DataValidation] Failed to load schema: {e}")
|
| 52 |
+
return {}
|
| 53 |
+
|
| 54 |
+
def _validate_required_columns(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
|
| 55 |
+
"""
|
| 56 |
+
Check that all required columns are present.
|
| 57 |
+
|
| 58 |
+
Returns:
|
| 59 |
+
List of validation errors
|
| 60 |
+
"""
|
| 61 |
+
errors = []
|
| 62 |
+
|
| 63 |
+
for col in self.config.required_columns:
|
| 64 |
+
if col not in df.columns:
|
| 65 |
+
errors.append({
|
| 66 |
+
"type": "missing_column",
|
| 67 |
+
"column": col,
|
| 68 |
+
"message": f"Required column '{col}' is missing"
|
| 69 |
+
})
|
| 70 |
+
|
| 71 |
+
return errors
|
| 72 |
+
|
| 73 |
+
def _validate_column_types(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
|
| 74 |
+
"""
|
| 75 |
+
Validate column data types based on schema.
|
| 76 |
+
|
| 77 |
+
Returns:
|
| 78 |
+
List of validation errors
|
| 79 |
+
"""
|
| 80 |
+
errors = []
|
| 81 |
+
|
| 82 |
+
if "feed_columns" not in self.schema:
|
| 83 |
+
return errors
|
| 84 |
+
|
| 85 |
+
for col_name, col_spec in self.schema["feed_columns"].items():
|
| 86 |
+
if col_name not in df.columns:
|
| 87 |
+
continue
|
| 88 |
+
|
| 89 |
+
expected_dtype = col_spec.get("dtype", "str")
|
| 90 |
+
|
| 91 |
+
# Check for null values in required columns
|
| 92 |
+
if col_spec.get("required", False):
|
| 93 |
+
null_count = df[col_name].isna().sum()
|
| 94 |
+
if null_count > 0:
|
| 95 |
+
errors.append({
|
| 96 |
+
"type": "null_values",
|
| 97 |
+
"column": col_name,
|
| 98 |
+
"count": int(null_count),
|
| 99 |
+
"message": f"Column '{col_name}' has {null_count} null values"
|
| 100 |
+
})
|
| 101 |
+
|
| 102 |
+
# Check min/max length for strings
|
| 103 |
+
if expected_dtype == "str" and col_name in df.columns:
|
| 104 |
+
min_len = col_spec.get("min_length", 0)
|
| 105 |
+
max_len = col_spec.get("max_length", float('inf'))
|
| 106 |
+
|
| 107 |
+
if min_len > 0:
|
| 108 |
+
short_count = (df[col_name].fillna("").str.len() < min_len).sum()
|
| 109 |
+
if short_count > 0:
|
| 110 |
+
errors.append({
|
| 111 |
+
"type": "min_length_violation",
|
| 112 |
+
"column": col_name,
|
| 113 |
+
"count": int(short_count),
|
| 114 |
+
"message": f"Column '{col_name}' has {short_count} values shorter than {min_len}"
|
| 115 |
+
})
|
| 116 |
+
|
| 117 |
+
# Check allowed values
|
| 118 |
+
allowed = col_spec.get("allowed_values")
|
| 119 |
+
if allowed and col_name in df.columns:
|
| 120 |
+
invalid_mask = ~df[col_name].isin(allowed) & df[col_name].notna()
|
| 121 |
+
invalid_count = invalid_mask.sum()
|
| 122 |
+
if invalid_count > 0:
|
| 123 |
+
errors.append({
|
| 124 |
+
"type": "invalid_value",
|
| 125 |
+
"column": col_name,
|
| 126 |
+
"count": int(invalid_count),
|
| 127 |
+
"allowed": allowed,
|
| 128 |
+
"message": f"Column '{col_name}' has {invalid_count} values not in allowed list"
|
| 129 |
+
})
|
| 130 |
+
|
| 131 |
+
return errors
|
| 132 |
+
|
| 133 |
+
def _validate_numeric_ranges(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
|
| 134 |
+
"""
|
| 135 |
+
Validate numeric column ranges.
|
| 136 |
+
|
| 137 |
+
Returns:
|
| 138 |
+
List of validation errors
|
| 139 |
+
"""
|
| 140 |
+
errors = []
|
| 141 |
+
|
| 142 |
+
if "feed_columns" not in self.schema:
|
| 143 |
+
return errors
|
| 144 |
+
|
| 145 |
+
for col_name, col_spec in self.schema["feed_columns"].items():
|
| 146 |
+
if col_name not in df.columns:
|
| 147 |
+
continue
|
| 148 |
+
|
| 149 |
+
expected_dtype = col_spec.get("dtype")
|
| 150 |
+
|
| 151 |
+
if expected_dtype in ["int", "float"]:
|
| 152 |
+
min_val = col_spec.get("min_value")
|
| 153 |
+
max_val = col_spec.get("max_value")
|
| 154 |
+
|
| 155 |
+
if min_val is not None:
|
| 156 |
+
try:
|
| 157 |
+
below_count = (pd.to_numeric(df[col_name], errors='coerce') < min_val).sum()
|
| 158 |
+
if below_count > 0:
|
| 159 |
+
errors.append({
|
| 160 |
+
"type": "below_minimum",
|
| 161 |
+
"column": col_name,
|
| 162 |
+
"count": int(below_count),
|
| 163 |
+
"min_value": min_val,
|
| 164 |
+
"message": f"Column '{col_name}' has {below_count} values below {min_val}"
|
| 165 |
+
})
|
| 166 |
+
except Exception:
|
| 167 |
+
pass
|
| 168 |
+
|
| 169 |
+
if max_val is not None:
|
| 170 |
+
try:
|
| 171 |
+
above_count = (pd.to_numeric(df[col_name], errors='coerce') > max_val).sum()
|
| 172 |
+
if above_count > 0:
|
| 173 |
+
errors.append({
|
| 174 |
+
"type": "above_maximum",
|
| 175 |
+
"column": col_name,
|
| 176 |
+
"count": int(above_count),
|
| 177 |
+
"max_value": max_val,
|
| 178 |
+
"message": f"Column '{col_name}' has {above_count} values above {max_val}"
|
| 179 |
+
})
|
| 180 |
+
except Exception:
|
| 181 |
+
pass
|
| 182 |
+
|
| 183 |
+
return errors
|
| 184 |
+
|
| 185 |
+
def validate(self, data_path: str) -> DataValidationArtifact:
|
| 186 |
+
"""
|
| 187 |
+
Execute data validation pipeline.
|
| 188 |
+
|
| 189 |
+
Args:
|
| 190 |
+
data_path: Path to input data (parquet or csv)
|
| 191 |
+
|
| 192 |
+
Returns:
|
| 193 |
+
DataValidationArtifact with validation results
|
| 194 |
+
"""
|
| 195 |
+
logger.info(f"[DataValidation] Validating: {data_path}")
|
| 196 |
+
|
| 197 |
+
# Load data
|
| 198 |
+
if data_path.endswith(".parquet"):
|
| 199 |
+
df = pd.read_parquet(data_path)
|
| 200 |
+
elif data_path.endswith(".csv"):
|
| 201 |
+
df = pd.read_csv(data_path)
|
| 202 |
+
else:
|
| 203 |
+
raise ValueError(f"Unsupported file format: {data_path}")
|
| 204 |
+
|
| 205 |
+
total_records = len(df)
|
| 206 |
+
logger.info(f"[DataValidation] Loaded {total_records} records")
|
| 207 |
+
|
| 208 |
+
# Run validations
|
| 209 |
+
all_errors = []
|
| 210 |
+
all_errors.extend(self._validate_required_columns(df))
|
| 211 |
+
all_errors.extend(self._validate_column_types(df))
|
| 212 |
+
all_errors.extend(self._validate_numeric_ranges(df))
|
| 213 |
+
|
| 214 |
+
# Calculate valid/invalid records
|
| 215 |
+
invalid_records = 0
|
| 216 |
+
for error in all_errors:
|
| 217 |
+
if "count" in error:
|
| 218 |
+
invalid_records = max(invalid_records, error["count"])
|
| 219 |
+
|
| 220 |
+
valid_records = total_records - invalid_records
|
| 221 |
+
validation_status = len(all_errors) == 0
|
| 222 |
+
|
| 223 |
+
# Log validation results
|
| 224 |
+
if validation_status:
|
| 225 |
+
logger.info("[DataValidation] ✓ All validations passed")
|
| 226 |
+
else:
|
| 227 |
+
logger.warning(f"[DataValidation] ⚠ Found {len(all_errors)} validation issues")
|
| 228 |
+
for error in all_errors[:5]: # Log first 5
|
| 229 |
+
logger.warning(f" - {error['message']}")
|
| 230 |
+
|
| 231 |
+
# Save validated data (even with warnings, we continue)
|
| 232 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 233 |
+
validated_path = Path(self.config.output_directory) / f"validated_data_{timestamp}.parquet"
|
| 234 |
+
df.to_parquet(validated_path, index=False)
|
| 235 |
+
|
| 236 |
+
# Save validation report
|
| 237 |
+
report_path = Path(self.config.output_directory) / f"validation_report_{timestamp}.yaml"
|
| 238 |
+
report = {
|
| 239 |
+
"validation_timestamp": timestamp,
|
| 240 |
+
"input_path": data_path,
|
| 241 |
+
"total_records": total_records,
|
| 242 |
+
"valid_records": valid_records,
|
| 243 |
+
"invalid_records": invalid_records,
|
| 244 |
+
"validation_status": validation_status,
|
| 245 |
+
"errors": all_errors
|
| 246 |
+
}
|
| 247 |
+
with open(report_path, 'w') as f:
|
| 248 |
+
yaml.dump(report, f, default_flow_style=False)
|
| 249 |
+
|
| 250 |
+
artifact = DataValidationArtifact(
|
| 251 |
+
validated_data_path=str(validated_path),
|
| 252 |
+
validation_report_path=str(report_path),
|
| 253 |
+
total_records=total_records,
|
| 254 |
+
valid_records=valid_records,
|
| 255 |
+
invalid_records=invalid_records,
|
| 256 |
+
validation_status=validation_status,
|
| 257 |
+
validation_errors=all_errors
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
logger.info(f"[DataValidation] ✓ Complete: {valid_records}/{total_records} valid records")
|
| 261 |
+
return artifact
|
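Note: the shape of the loaded schema that these checks expect, shown as the Python dict yaml.safe_load would return; the keys are inferred from the .get() calls above, and the column names and values are hypothetical (the actual data_schema/schema.yaml may differ).

# Illustrative schema dict only, not the repository's schema.yaml.
example_schema = {
    "feed_columns": {
        "post_id": {"dtype": "str", "required": True},
        "text": {"dtype": "str", "required": True, "min_length": 10},
        "source": {"dtype": "str", "allowed_values": ["sqlite", "csv"]},
        "engagement_likes": {"dtype": "int", "min_value": 0},
        "language_confidence": {"dtype": "float", "min_value": 0.0, "max_value": 1.0},
    }
}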
models/anomaly-detection/src/components/model_trainer.py
ADDED
@@ -0,0 +1,478 @@
"""
|
| 2 |
+
models/anomaly-detection/src/components/model_trainer.py
|
| 3 |
+
Model training with Optuna hyperparameter tuning for clustering/anomaly detection
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
import joblib
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Optional, Dict, Any, List
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
from ..entity import ModelTrainerConfig, ModelTrainerArtifact
|
| 14 |
+
from ..utils import calculate_clustering_metrics, calculate_optuna_objective, format_metrics_report
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger("model_trainer")
|
| 17 |
+
|
| 18 |
+
# MLflow
|
| 19 |
+
try:
|
| 20 |
+
import mlflow
|
| 21 |
+
import mlflow.sklearn
|
| 22 |
+
MLFLOW_AVAILABLE = True
|
| 23 |
+
except ImportError:
|
| 24 |
+
MLFLOW_AVAILABLE = False
|
| 25 |
+
logger.warning("MLflow not available. Install with: pip install mlflow")
|
| 26 |
+
|
| 27 |
+
# Optuna
|
| 28 |
+
try:
|
| 29 |
+
import optuna
|
| 30 |
+
from optuna.samplers import TPESampler
|
| 31 |
+
OPTUNA_AVAILABLE = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
OPTUNA_AVAILABLE = False
|
| 34 |
+
logger.warning("Optuna not available. Install with: pip install optuna")
|
| 35 |
+
|
| 36 |
+
# Clustering algorithms
|
| 37 |
+
try:
|
| 38 |
+
from sklearn.cluster import DBSCAN, KMeans
|
| 39 |
+
from sklearn.ensemble import IsolationForest
|
| 40 |
+
from sklearn.neighbors import LocalOutlierFactor
|
| 41 |
+
SKLEARN_AVAILABLE = True
|
| 42 |
+
except ImportError:
|
| 43 |
+
SKLEARN_AVAILABLE = False
|
| 44 |
+
|
| 45 |
+
try:
|
| 46 |
+
import hdbscan
|
| 47 |
+
HDBSCAN_AVAILABLE = True
|
| 48 |
+
except ImportError:
|
| 49 |
+
HDBSCAN_AVAILABLE = False
|
| 50 |
+
logger.warning("HDBSCAN not available. Install with: pip install hdbscan")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class ModelTrainer:
|
| 54 |
+
"""
|
| 55 |
+
Model training component with:
|
| 56 |
+
1. Optuna hyperparameter optimization
|
| 57 |
+
2. Multiple clustering algorithms (DBSCAN, KMeans, HDBSCAN)
|
| 58 |
+
3. Anomaly detection (Isolation Forest, LOF)
|
| 59 |
+
4. MLflow experiment tracking
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
def __init__(self, config: Optional[ModelTrainerConfig] = None):
|
| 63 |
+
"""
|
| 64 |
+
Initialize model trainer.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
config: Optional configuration
|
| 68 |
+
"""
|
| 69 |
+
self.config = config or ModelTrainerConfig()
|
| 70 |
+
|
| 71 |
+
# Ensure output directory exists
|
| 72 |
+
Path(self.config.output_directory).mkdir(parents=True, exist_ok=True)
|
| 73 |
+
|
| 74 |
+
# Setup MLflow
|
| 75 |
+
self._setup_mlflow()
|
| 76 |
+
|
| 77 |
+
logger.info(f"[ModelTrainer] Initialized")
|
| 78 |
+
logger.info(f" Models to train: {self.config.models_to_train}")
|
| 79 |
+
logger.info(f" Optuna trials: {self.config.n_optuna_trials}")
|
| 80 |
+
|
| 81 |
+
def _setup_mlflow(self):
|
| 82 |
+
"""Configure MLflow tracking"""
|
| 83 |
+
if not MLFLOW_AVAILABLE:
|
| 84 |
+
logger.warning("[ModelTrainer] MLflow not available")
|
| 85 |
+
return
|
| 86 |
+
|
| 87 |
+
try:
|
| 88 |
+
# Set tracking URI
|
| 89 |
+
mlflow.set_tracking_uri(self.config.mlflow_tracking_uri)
|
| 90 |
+
|
| 91 |
+
# Set credentials for DagsHub
|
| 92 |
+
if self.config.mlflow_username and self.config.mlflow_password:
|
| 93 |
+
os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.mlflow_username
|
| 94 |
+
os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.mlflow_password
|
| 95 |
+
|
| 96 |
+
# Create or get experiment
|
| 97 |
+
try:
|
| 98 |
+
mlflow.create_experiment(self.config.experiment_name)
|
| 99 |
+
except Exception:
|
| 100 |
+
pass
|
| 101 |
+
mlflow.set_experiment(self.config.experiment_name)
|
| 102 |
+
|
| 103 |
+
logger.info(f"[ModelTrainer] MLflow configured: {self.config.mlflow_tracking_uri}")
|
| 104 |
+
|
| 105 |
+
except Exception as e:
|
| 106 |
+
logger.warning(f"[ModelTrainer] MLflow setup error: {e}")
|
| 107 |
+
|
| 108 |
+
def _train_dbscan(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
|
| 109 |
+
"""
|
| 110 |
+
Train DBSCAN with optional Optuna tuning.
|
| 111 |
+
"""
|
| 112 |
+
if not SKLEARN_AVAILABLE:
|
| 113 |
+
return {"error": "sklearn not available"}
|
| 114 |
+
|
| 115 |
+
# Hyperparameters
|
| 116 |
+
if trial:
|
| 117 |
+
eps = trial.suggest_float("eps", 0.1, 2.0)
|
| 118 |
+
min_samples = trial.suggest_int("min_samples", 2, 20)
|
| 119 |
+
else:
|
| 120 |
+
eps = 0.5
|
| 121 |
+
min_samples = 5
|
| 122 |
+
|
| 123 |
+
model = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
|
| 124 |
+
labels = model.fit_predict(X)
|
| 125 |
+
|
| 126 |
+
metrics = calculate_clustering_metrics(X, labels)
|
| 127 |
+
metrics["eps"] = eps
|
| 128 |
+
metrics["min_samples"] = min_samples
|
| 129 |
+
|
| 130 |
+
return {
|
| 131 |
+
"model": model,
|
| 132 |
+
"labels": labels,
|
| 133 |
+
"metrics": metrics,
|
| 134 |
+
"params": {"eps": eps, "min_samples": min_samples}
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
def _train_kmeans(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
|
| 138 |
+
"""
|
| 139 |
+
Train KMeans with optional Optuna tuning.
|
| 140 |
+
"""
|
| 141 |
+
if not SKLEARN_AVAILABLE:
|
| 142 |
+
return {"error": "sklearn not available"}
|
| 143 |
+
|
| 144 |
+
# Hyperparameters
|
| 145 |
+
if trial:
|
| 146 |
+
n_clusters = trial.suggest_int("n_clusters", 2, 20)
|
| 147 |
+
n_init = trial.suggest_int("n_init", 5, 20)
|
| 148 |
+
else:
|
| 149 |
+
n_clusters = 5
|
| 150 |
+
n_init = 10
|
| 151 |
+
|
| 152 |
+
model = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=42)
|
| 153 |
+
labels = model.fit_predict(X)
|
| 154 |
+
|
| 155 |
+
metrics = calculate_clustering_metrics(X, labels)
|
| 156 |
+
metrics["n_clusters"] = n_clusters
|
| 157 |
+
|
| 158 |
+
return {
|
| 159 |
+
"model": model,
|
| 160 |
+
"labels": labels,
|
| 161 |
+
"metrics": metrics,
|
| 162 |
+
"params": {"n_clusters": n_clusters, "n_init": n_init}
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
def _train_hdbscan(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
|
| 166 |
+
"""
|
| 167 |
+
Train HDBSCAN with optional Optuna tuning.
|
| 168 |
+
"""
|
| 169 |
+
if not HDBSCAN_AVAILABLE:
|
| 170 |
+
return {"error": "hdbscan not available"}
|
| 171 |
+
|
| 172 |
+
# Hyperparameters
|
| 173 |
+
if trial:
|
| 174 |
+
min_cluster_size = trial.suggest_int("min_cluster_size", 5, 50)
|
| 175 |
+
min_samples = trial.suggest_int("min_samples", 1, 20)
|
| 176 |
+
else:
|
| 177 |
+
min_cluster_size = 15
|
| 178 |
+
min_samples = 5
|
| 179 |
+
|
| 180 |
+
model = hdbscan.HDBSCAN(
|
| 181 |
+
min_cluster_size=min_cluster_size,
|
| 182 |
+
min_samples=min_samples,
|
| 183 |
+
core_dist_n_jobs=-1
|
| 184 |
+
)
|
| 185 |
+
labels = model.fit_predict(X)
|
| 186 |
+
|
| 187 |
+
metrics = calculate_clustering_metrics(X, labels)
|
| 188 |
+
|
| 189 |
+
return {
|
| 190 |
+
"model": model,
|
| 191 |
+
"labels": labels,
|
| 192 |
+
"metrics": metrics,
|
| 193 |
+
"params": {"min_cluster_size": min_cluster_size, "min_samples": min_samples}
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
def _train_isolation_forest(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
|
| 197 |
+
"""
|
| 198 |
+
Train Isolation Forest for anomaly detection.
|
| 199 |
+
"""
|
| 200 |
+
if not SKLEARN_AVAILABLE:
|
| 201 |
+
return {"error": "sklearn not available"}
|
| 202 |
+
|
| 203 |
+
# Hyperparameters
|
| 204 |
+
if trial:
|
| 205 |
+
contamination = trial.suggest_float("contamination", 0.01, 0.3)
|
| 206 |
+
n_estimators = trial.suggest_int("n_estimators", 50, 200)
|
| 207 |
+
else:
|
| 208 |
+
contamination = 0.1
|
| 209 |
+
n_estimators = 100
|
| 210 |
+
|
| 211 |
+
model = IsolationForest(
|
| 212 |
+
contamination=contamination,
|
| 213 |
+
n_estimators=n_estimators,
|
| 214 |
+
random_state=42,
|
| 215 |
+
n_jobs=-1
|
| 216 |
+
)
|
| 217 |
+
predictions = model.fit_predict(X)
|
| 218 |
+
labels = (predictions == -1).astype(int) # -1 = anomaly
|
| 219 |
+
|
| 220 |
+
n_anomalies = int(np.sum(labels))
|
| 221 |
+
|
| 222 |
+
return {
|
| 223 |
+
"model": model,
|
| 224 |
+
"labels": labels,
|
| 225 |
+
"metrics": {
|
| 226 |
+
"n_anomalies": n_anomalies,
|
| 227 |
+
"anomaly_rate": n_anomalies / len(X),
|
| 228 |
+
"contamination": contamination,
|
| 229 |
+
"n_estimators": n_estimators
|
| 230 |
+
},
|
| 231 |
+
"params": {"contamination": contamination, "n_estimators": n_estimators},
|
| 232 |
+
"anomaly_indices": np.where(labels == 1)[0].tolist()
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
def _train_lof(self, X: np.ndarray, trial: Optional['optuna.Trial'] = None) -> Dict[str, Any]:
|
| 236 |
+
"""
|
| 237 |
+
Train Local Outlier Factor for anomaly detection.
|
| 238 |
+
"""
|
| 239 |
+
if not SKLEARN_AVAILABLE:
|
| 240 |
+
return {"error": "sklearn not available"}
|
| 241 |
+
|
| 242 |
+
# Hyperparameters
|
| 243 |
+
if trial:
|
| 244 |
+
n_neighbors = trial.suggest_int("n_neighbors", 5, 50)
|
| 245 |
+
contamination = trial.suggest_float("contamination", 0.01, 0.3)
|
| 246 |
+
else:
|
| 247 |
+
n_neighbors = 20
|
| 248 |
+
contamination = 0.1
|
| 249 |
+
|
| 250 |
+
model = LocalOutlierFactor(
|
| 251 |
+
n_neighbors=n_neighbors,
|
| 252 |
+
contamination=contamination,
|
| 253 |
+
n_jobs=-1,
|
| 254 |
+
novelty=True # For prediction on new data
|
| 255 |
+
)
|
| 256 |
+
model.fit(X)
|
| 257 |
+
predictions = model.predict(X)
|
| 258 |
+
labels = (predictions == -1).astype(int) # -1 = anomaly
|
| 259 |
+
|
| 260 |
+
n_anomalies = int(np.sum(labels))
|
| 261 |
+
|
| 262 |
+
return {
|
| 263 |
+
"model": model,
|
| 264 |
+
"labels": labels,
|
| 265 |
+
"metrics": {
|
| 266 |
+
"n_anomalies": n_anomalies,
|
| 267 |
+
"anomaly_rate": n_anomalies / len(X),
|
| 268 |
+
"n_neighbors": n_neighbors,
|
| 269 |
+
"contamination": contamination
|
| 270 |
+
},
|
| 271 |
+
"params": {"n_neighbors": n_neighbors, "contamination": contamination},
|
| 272 |
+
"anomaly_indices": np.where(labels == 1)[0].tolist()
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
def _optimize_model(self, model_name: str, X: np.ndarray) -> Dict[str, Any]:
|
| 276 |
+
"""
|
| 277 |
+
Use Optuna to find best hyperparameters for a model.
|
| 278 |
+
"""
|
| 279 |
+
if not OPTUNA_AVAILABLE:
|
| 280 |
+
logger.warning("[ModelTrainer] Optuna not available, using defaults")
|
| 281 |
+
return self._train_model(model_name, X, None)
|
| 282 |
+
|
| 283 |
+
train_func = {
|
| 284 |
+
"dbscan": self._train_dbscan,
|
| 285 |
+
"kmeans": self._train_kmeans,
|
| 286 |
+
"hdbscan": self._train_hdbscan,
|
| 287 |
+
"isolation_forest": self._train_isolation_forest,
|
| 288 |
+
"lof": self._train_lof
|
| 289 |
+
}.get(model_name)
|
| 290 |
+
|
| 291 |
+
if not train_func:
|
| 292 |
+
return {"error": f"Unknown model: {model_name}"}
|
| 293 |
+
|
| 294 |
+
def objective(trial):
|
| 295 |
+
try:
|
| 296 |
+
result = train_func(X, trial)
|
| 297 |
+
if "error" in result:
|
| 298 |
+
return -1.0
|
| 299 |
+
|
| 300 |
+
metrics = result.get("metrics", {})
|
| 301 |
+
|
| 302 |
+
# For clustering: use silhouette
|
| 303 |
+
if model_name in ["dbscan", "kmeans", "hdbscan"]:
|
| 304 |
+
score = metrics.get("silhouette_score", -1)
|
| 305 |
+
return score if score is not None else -1
|
| 306 |
+
|
| 307 |
+
# For anomaly detection: balance anomaly rate
|
| 308 |
+
else:
|
| 309 |
+
# Target anomaly rate around 5-15%
|
| 310 |
+
rate = metrics.get("anomaly_rate", 0)
|
| 311 |
+
target = 0.1
|
| 312 |
+
return -abs(rate - target) # Closer to target is better
|
| 313 |
+
|
| 314 |
+
except Exception as e:
|
| 315 |
+
logger.debug(f"Trial failed: {e}")
|
| 316 |
+
return -1.0
|
| 317 |
+
|
| 318 |
+
# Create and run study
|
| 319 |
+
study = optuna.create_study(
|
| 320 |
+
direction="maximize",
|
| 321 |
+
sampler=TPESampler(seed=42)
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
study.optimize(
|
| 325 |
+
objective,
|
| 326 |
+
n_trials=self.config.n_optuna_trials,
|
| 327 |
+
timeout=self.config.optuna_timeout_seconds,
|
| 328 |
+
show_progress_bar=True
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
logger.info(f"[ModelTrainer] {model_name} best params: {study.best_params}")
|
| 332 |
+
logger.info(f"[ModelTrainer] {model_name} best score: {study.best_value:.4f}")
|
| 333 |
+
|
| 334 |
+
# Train with best params
|
| 335 |
+
best_result = train_func(X, None) # Use defaults as base
|
| 336 |
+
# Override with best params
|
| 337 |
+
if study.best_params:
|
| 338 |
+
# Re-train with best params would require custom logic
|
| 339 |
+
# For now, we just log the best params
|
| 340 |
+
best_result["best_params"] = study.best_params
|
| 341 |
+
best_result["best_score"] = study.best_value
|
| 342 |
+
best_result["study_name"] = study.study_name
|
| 343 |
+
|
| 344 |
+
return best_result
|
| 345 |
+
|
| 346 |
+
def _train_model(self, model_name: str, X: np.ndarray, trial=None) -> Dict[str, Any]:
|
| 347 |
+
"""Train a single model"""
|
| 348 |
+
train_funcs = {
|
| 349 |
+
"dbscan": self._train_dbscan,
|
| 350 |
+
"kmeans": self._train_kmeans,
|
| 351 |
+
"hdbscan": self._train_hdbscan,
|
| 352 |
+
"isolation_forest": self._train_isolation_forest,
|
| 353 |
+
"lof": self._train_lof
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
func = train_funcs.get(model_name)
|
| 357 |
+
if func:
|
| 358 |
+
return func(X, trial)
|
| 359 |
+
return {"error": f"Unknown model: {model_name}"}
|
| 360 |
+
|
| 361 |
+
def train(self, feature_path: str) -> ModelTrainerArtifact:
|
| 362 |
+
"""
|
| 363 |
+
Execute model training pipeline.
|
| 364 |
+
|
| 365 |
+
Args:
|
| 366 |
+
feature_path: Path to feature matrix (.npy)
|
| 367 |
+
|
| 368 |
+
Returns:
|
| 369 |
+
ModelTrainerArtifact with results
|
| 370 |
+
"""
|
| 371 |
+
logger.info(f"[ModelTrainer] Starting training: {feature_path}")
|
| 372 |
+
start_time = datetime.now()
|
| 373 |
+
|
| 374 |
+
# Load features
|
| 375 |
+
X = np.load(feature_path)
|
| 376 |
+
logger.info(f"[ModelTrainer] Loaded features: {X.shape}")
|
| 377 |
+
|
| 378 |
+
# Start MLflow run
|
| 379 |
+
mlflow_run_id = ""
|
| 380 |
+
mlflow_experiment_id = ""
|
| 381 |
+
|
| 382 |
+
if MLFLOW_AVAILABLE:
|
| 383 |
+
try:
|
| 384 |
+
run = mlflow.start_run()
|
| 385 |
+
mlflow_run_id = run.info.run_id
|
| 386 |
+
mlflow_experiment_id = run.info.experiment_id
|
| 387 |
+
|
| 388 |
+
mlflow.log_param("n_samples", X.shape[0])
|
| 389 |
+
mlflow.log_param("n_features", X.shape[1])
|
| 390 |
+
mlflow.log_param("models", self.config.models_to_train)
|
| 391 |
+
except Exception as e:
|
| 392 |
+
logger.warning(f"[ModelTrainer] MLflow run start error: {e}")
|
| 393 |
+
|
| 394 |
+
# Train all models
|
| 395 |
+
trained_models = []
|
| 396 |
+
best_model = None
|
| 397 |
+
best_score = -float('inf')
|
| 398 |
+
|
| 399 |
+
for model_name in self.config.models_to_train:
|
| 400 |
+
logger.info(f"[ModelTrainer] Training {model_name}...")
|
| 401 |
+
|
| 402 |
+
try:
|
| 403 |
+
result = self._optimize_model(model_name, X)
|
| 404 |
+
|
| 405 |
+
if "error" in result:
|
| 406 |
+
logger.warning(f"[ModelTrainer] {model_name} error: {result['error']}")
|
| 407 |
+
continue
|
| 408 |
+
|
| 409 |
+
# Save model
|
| 410 |
+
model_path = Path(self.config.output_directory) / f"{model_name}_model.joblib"
|
| 411 |
+
joblib.dump(result["model"], model_path)
|
| 412 |
+
|
| 413 |
+
# Log to MLflow
|
| 414 |
+
if MLFLOW_AVAILABLE:
|
| 415 |
+
try:
|
| 416 |
+
mlflow.log_params({f"{model_name}_{k}": v for k, v in result.get("params", {}).items()})
|
| 417 |
+
mlflow.log_metrics({f"{model_name}_{k}": v for k, v in result.get("metrics", {}).items() if isinstance(v, (int, float))})
|
| 418 |
+
mlflow.sklearn.log_model(result["model"], model_name)
|
| 419 |
+
except Exception as e:
|
| 420 |
+
logger.debug(f"MLflow log error: {e}")
|
| 421 |
+
|
| 422 |
+
# Track results
|
| 423 |
+
model_info = {
|
| 424 |
+
"name": model_name,
|
| 425 |
+
"path": str(model_path),
|
| 426 |
+
"params": result.get("params", {}),
|
| 427 |
+
"metrics": result.get("metrics", {})
|
| 428 |
+
}
|
| 429 |
+
trained_models.append(model_info)
|
| 430 |
+
|
| 431 |
+
# Check if best (for clustering models)
|
| 432 |
+
score = result.get("metrics", {}).get("silhouette_score", -1)
|
| 433 |
+
if score and score > best_score:
|
| 434 |
+
best_score = score
|
| 435 |
+
best_model = model_info
|
| 436 |
+
|
| 437 |
+
logger.info(f"[ModelTrainer] ✓ {model_name} complete")
|
| 438 |
+
|
| 439 |
+
except Exception as e:
|
| 440 |
+
logger.error(f"[ModelTrainer] {model_name} failed: {e}")
|
| 441 |
+
|
| 442 |
+
# End MLflow run
|
| 443 |
+
if MLFLOW_AVAILABLE:
|
| 444 |
+
try:
|
| 445 |
+
mlflow.end_run()
|
| 446 |
+
except Exception:
|
| 447 |
+
pass
|
| 448 |
+
|
| 449 |
+
# Calculate duration
|
| 450 |
+
duration = (datetime.now() - start_time).total_seconds()
|
| 451 |
+
|
| 452 |
+
# Get anomaly info from best anomaly detector
|
| 453 |
+
n_anomalies = None
|
| 454 |
+
anomaly_indices = None
|
| 455 |
+
for model_info in trained_models:
|
| 456 |
+
if model_info["name"] in ["isolation_forest", "lof"]:
|
| 457 |
+
n_anomalies = model_info["metrics"].get("n_anomalies")
|
| 458 |
+
break
|
| 459 |
+
|
| 460 |
+
# Build artifact
|
| 461 |
+
artifact = ModelTrainerArtifact(
|
| 462 |
+
best_model_name=best_model["name"] if best_model else "",
|
| 463 |
+
best_model_path=best_model["path"] if best_model else "",
|
| 464 |
+
best_model_metrics=best_model["metrics"] if best_model else {},
|
| 465 |
+
trained_models=trained_models,
|
| 466 |
+
mlflow_run_id=mlflow_run_id,
|
| 467 |
+
mlflow_experiment_id=mlflow_experiment_id,
|
| 468 |
+
n_clusters=best_model["metrics"].get("n_clusters") if best_model else None,
|
| 469 |
+
n_anomalies=n_anomalies,
|
| 470 |
+
anomaly_indices=anomaly_indices,
|
| 471 |
+
training_duration_seconds=duration,
|
| 472 |
+
optuna_study_name=None
|
| 473 |
+
)
|
| 474 |
+
|
| 475 |
+
logger.info(f"[ModelTrainer] ✓ Training complete in {duration:.1f}s")
|
| 476 |
+
logger.info(f"[ModelTrainer] Best model: {best_model['name'] if best_model else 'N/A'}")
|
| 477 |
+
|
| 478 |
+
return artifact
|
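Note: the trainer imports calculate_clustering_metrics from ..utils (src/utils/metrics.py, not shown in this part of the diff). A minimal sketch consistent with how its output is used above — a dict containing at least silhouette_score and n_clusters, tolerant of DBSCAN/HDBSCAN noise labels — could look like this; it is not the project's actual implementation.

# Hypothetical sketch of the expected helper, for illustration only.
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score

def calculate_clustering_metrics(X: np.ndarray, labels: np.ndarray) -> dict:
    """Return basic clustering quality metrics keyed the way _optimize_model
    and train() read them back (silhouette_score, n_clusters)."""
    labels = np.asarray(labels)
    mask = labels != -1                      # DBSCAN/HDBSCAN mark noise as -1
    unique = np.unique(labels[mask])
    metrics = {
        "n_clusters": int(len(unique)),
        "n_noise": int(np.sum(labels == -1)),
        "silhouette_score": None,
    }
    # Silhouette needs at least 2 clusters and more points than clusters
    if len(unique) >= 2 and mask.sum() > len(unique):
        metrics["silhouette_score"] = float(silhouette_score(X[mask], labels[mask]))
        metrics["davies_bouldin"] = float(davies_bouldin_score(X[mask], labels[mask]))
    return metrics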
models/anomaly-detection/src/entity/__init__.py
ADDED
@@ -0,0 +1,30 @@
"""
|
| 2 |
+
models/anomaly-detection/src/entity/__init__.py
|
| 3 |
+
"""
|
| 4 |
+
from .config_entity import (
|
| 5 |
+
DataIngestionConfig,
|
| 6 |
+
DataValidationConfig,
|
| 7 |
+
DataTransformationConfig,
|
| 8 |
+
ModelTrainerConfig,
|
| 9 |
+
PipelineConfig
|
| 10 |
+
)
|
| 11 |
+
from .artifact_entity import (
|
| 12 |
+
DataIngestionArtifact,
|
| 13 |
+
DataValidationArtifact,
|
| 14 |
+
DataTransformationArtifact,
|
| 15 |
+
ModelTrainerArtifact,
|
| 16 |
+
PipelineArtifact
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
__all__ = [
|
| 20 |
+
"DataIngestionConfig",
|
| 21 |
+
"DataValidationConfig",
|
| 22 |
+
"DataTransformationConfig",
|
| 23 |
+
"ModelTrainerConfig",
|
| 24 |
+
"PipelineConfig",
|
| 25 |
+
"DataIngestionArtifact",
|
| 26 |
+
"DataValidationArtifact",
|
| 27 |
+
"DataTransformationArtifact",
|
| 28 |
+
"ModelTrainerArtifact",
|
| 29 |
+
"PipelineArtifact"
|
| 30 |
+
]
|
models/anomaly-detection/src/entity/artifact_entity.py
ADDED
@@ -0,0 +1,79 @@
"""
|
| 2 |
+
models/anomaly-detection/src/entity/artifact_entity.py
|
| 3 |
+
Artifact entities for pipeline outputs
|
| 4 |
+
"""
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import List, Dict, Any, Optional
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
|
| 11 |
+
class DataIngestionArtifact:
|
| 12 |
+
"""Artifact from data ingestion step"""
|
| 13 |
+
raw_data_path: str
|
| 14 |
+
total_records: int
|
| 15 |
+
records_from_sqlite: int
|
| 16 |
+
records_from_csv: int
|
| 17 |
+
ingestion_timestamp: str
|
| 18 |
+
is_data_available: bool
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
|
| 22 |
+
class DataValidationArtifact:
|
| 23 |
+
"""Artifact from data validation step"""
|
| 24 |
+
validated_data_path: str
|
| 25 |
+
validation_report_path: str
|
| 26 |
+
total_records: int
|
| 27 |
+
valid_records: int
|
| 28 |
+
invalid_records: int
|
| 29 |
+
validation_status: bool
|
| 30 |
+
validation_errors: List[Dict[str, Any]]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
|
| 34 |
+
class DataTransformationArtifact:
|
| 35 |
+
"""Artifact from data transformation step"""
|
| 36 |
+
transformed_data_path: str
|
| 37 |
+
vector_embeddings_path: str
|
| 38 |
+
feature_store_path: str
|
| 39 |
+
total_records: int
|
| 40 |
+
language_distribution: Dict[str, int]
|
| 41 |
+
transformation_report: Dict[str, Any]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
|
| 45 |
+
class ModelTrainerArtifact:
|
| 46 |
+
"""Artifact from model training step"""
|
| 47 |
+
# Best model info
|
| 48 |
+
best_model_name: str
|
| 49 |
+
best_model_path: str
|
| 50 |
+
best_model_metrics: Dict[str, float]
|
| 51 |
+
|
| 52 |
+
# All trained models
|
| 53 |
+
trained_models: List[Dict[str, Any]]
|
| 54 |
+
|
| 55 |
+
# MLflow tracking
|
| 56 |
+
mlflow_run_id: str
|
| 57 |
+
mlflow_experiment_id: str
|
| 58 |
+
|
| 59 |
+
# Cluster/anomaly results
|
| 60 |
+
n_clusters: Optional[int]
|
| 61 |
+
n_anomalies: Optional[int]
|
| 62 |
+
anomaly_indices: Optional[List[int]]
|
| 63 |
+
|
| 64 |
+
# Training info
|
| 65 |
+
training_duration_seconds: float
|
| 66 |
+
optuna_study_name: Optional[str]
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
|
| 70 |
+
class PipelineArtifact:
|
| 71 |
+
"""Complete pipeline artifact"""
|
| 72 |
+
data_ingestion: DataIngestionArtifact
|
| 73 |
+
data_validation: DataValidationArtifact
|
| 74 |
+
data_transformation: DataTransformationArtifact
|
| 75 |
+
model_trainer: ModelTrainerArtifact
|
| 76 |
+
pipeline_run_id: str
|
| 77 |
+
pipeline_start_time: str
|
| 78 |
+
pipeline_end_time: str
|
| 79 |
+
pipeline_status: str
|
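These artifacts are plain dataclasses, so each pipeline step simply constructs one and hands it to the next step. A small illustrative instantiation, with invented field values and assuming the src package of this model is importable:

from dataclasses import asdict
from src.entity.artifact_entity import DataIngestionArtifact

# Hypothetical values for illustration only.
artifact = DataIngestionArtifact(
    raw_data_path="artifacts/data_ingestion/ingested_data.parquet",
    total_records=1200,
    records_from_sqlite=900,
    records_from_csv=300,
    ingestion_timestamp="2025-12-08T15:23:30",
    is_data_available=True,
)
print(asdict(artifact)["total_records"])  # 1200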
models/anomaly-detection/src/entity/config_entity.py
ADDED
@@ -0,0 +1,109 @@
"""
models/anomaly-detection/src/entity/config_entity.py
Configuration entities for the anomaly detection pipeline
"""
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional
import os


@dataclass
class DataIngestionConfig:
    """Configuration for data ingestion component"""
    sqlite_db_path: str = field(default_factory=lambda: os.getenv(
        "SQLITE_DB_PATH",
        str(Path(__file__).parent.parent.parent.parent.parent / "data" / "feeds" / "feed_cache.db")
    ))
    csv_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.parent.parent / "datasets" / "political_feeds"
    ))
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "data_ingestion"
    ))
    batch_size: int = 1000
    min_text_length: int = 10


@dataclass
class DataValidationConfig:
    """Configuration for data validation component"""
    schema_file: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "data_schema" / "schema.yaml"
    ))
    required_columns: List[str] = field(default_factory=lambda: [
        "post_id", "timestamp", "platform", "category", "text", "content_hash"
    ])
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "data_validation"
    ))


@dataclass
class DataTransformationConfig:
    """Configuration for data transformation/vectorization component"""
    # Huggingface models - will be downloaded locally
    models_cache_dir: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "models_cache"
    ))

    # Language-specific BERT models
    english_model: str = "distilbert-base-uncased"
    sinhala_model: str = "keshan/SinhalaBERTo"
    tamil_model: str = "l3cube-pune/tamil-bert"

    # Language detection
    fasttext_model_path: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "models_cache" / "lid.176.bin"  # FastText language ID model
    ))

    # Vector dimensions
    vector_dim: int = 768  # Standard BERT dimension

    # Output
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "data_transformation"
    ))


@dataclass
class ModelTrainerConfig:
    """Configuration for model training component"""
    # MLflow configuration
    mlflow_tracking_uri: str = field(default_factory=lambda: os.getenv(
        "MLFLOW_TRACKING_URI", "https://dagshub.com/sliitguy/SecurityNetwork.mlflow"
    ))
    mlflow_username: str = field(default_factory=lambda: os.getenv(
        "MLFLOW_TRACKING_USERNAME", ""
    ))
    mlflow_password: str = field(default_factory=lambda: os.getenv(
        "MLFLOW_TRACKING_PASSWORD", ""
    ))
    experiment_name: str = "anomaly_detection_feeds"

    # Model configurations
    models_to_train: List[str] = field(default_factory=lambda: [
        "dbscan", "kmeans", "hdbscan", "isolation_forest", "lof"
    ])

    # Optuna hyperparameter tuning
    n_optuna_trials: int = 50
    optuna_timeout_seconds: int = 3600  # 1 hour

    # Model output
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "model_trainer"
    ))


@dataclass
class PipelineConfig:
    """Master configuration for the entire pipeline"""
    data_ingestion: DataIngestionConfig = field(default_factory=DataIngestionConfig)
    data_validation: DataValidationConfig = field(default_factory=DataValidationConfig)
    data_transformation: DataTransformationConfig = field(default_factory=DataTransformationConfig)
    model_trainer: ModelTrainerConfig = field(default_factory=ModelTrainerConfig)

    # Pipeline settings
    batch_threshold: int = 1000  # Trigger training after this many new records
    run_interval_hours: int = 24  # Fallback daily run
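Every default above is either a literal or a default_factory that reads the environment at construction time, so individual settings can be overridden per run. A brief sketch, assuming the src package is importable and that a local MLflow server is used instead of the DagsHub URI:

import os
# Read by ModelTrainerConfig's default_factory when the config is constructed.
os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"  # hypothetical local server

from src.entity.config_entity import PipelineConfig, ModelTrainerConfig

config = PipelineConfig(
    model_trainer=ModelTrainerConfig(n_optuna_trials=10, optuna_timeout_seconds=600)
)
print(config.model_trainer.mlflow_tracking_uri)  # http://localhost:5000
print(config.batch_threshold)                    # 1000 (default)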
models/anomaly-detection/src/pipeline/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""
models/anomaly-detection/src/pipeline/__init__.py
"""
from .training_pipeline import TrainingPipeline, run_training_pipeline

__all__ = ["TrainingPipeline", "run_training_pipeline"]
models/anomaly-detection/src/pipeline/training_pipeline.py
ADDED
@@ -0,0 +1,162 @@
"""
models/anomaly-detection/src/pipeline/training_pipeline.py
End-to-end training pipeline orchestrator
"""
import logging
from datetime import datetime
from pathlib import Path
from typing import Optional

from ..entity import (
    PipelineConfig,
    PipelineArtifact,
    DataIngestionArtifact,
    DataValidationArtifact,
    DataTransformationArtifact,
    ModelTrainerArtifact
)
from ..components import (
    DataIngestion,
    DataValidation,
    DataTransformation,
    ModelTrainer
)

logger = logging.getLogger("training_pipeline")


class TrainingPipeline:
    """
    End-to-end training pipeline that orchestrates:
    1. Data Ingestion (SQLite + CSV)
    2. Data Validation (schema checking)
    3. Data Transformation (language detection + vectorization)
    4. Model Training (clustering + anomaly detection)
    """

    def __init__(self, config: Optional[PipelineConfig] = None):
        """
        Initialize training pipeline.

        Args:
            config: Optional pipeline configuration
        """
        self.config = config or PipelineConfig()
        self.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")

        logger.info(f"[TrainingPipeline] Initialized (run_id: {self.run_id})")

    def run_data_ingestion(self) -> DataIngestionArtifact:
        """Execute data ingestion step"""
        logger.info("=" * 50)
        logger.info("[TrainingPipeline] STEP 1: Data Ingestion")
        logger.info("=" * 50)

        ingestion = DataIngestion(self.config.data_ingestion)
        artifact = ingestion.ingest()

        if not artifact.is_data_available:
            raise ValueError("No data available for training")

        return artifact

    def run_data_validation(self, ingestion_artifact: DataIngestionArtifact) -> DataValidationArtifact:
        """Execute data validation step"""
        logger.info("=" * 50)
        logger.info("[TrainingPipeline] STEP 2: Data Validation")
        logger.info("=" * 50)

        validation = DataValidation(self.config.data_validation)
        artifact = validation.validate(ingestion_artifact.raw_data_path)

        return artifact

    def run_data_transformation(self, validation_artifact: DataValidationArtifact) -> DataTransformationArtifact:
        """Execute data transformation step"""
        logger.info("=" * 50)
        logger.info("[TrainingPipeline] STEP 3: Data Transformation")
        logger.info("=" * 50)

        transformation = DataTransformation(self.config.data_transformation)
        artifact = transformation.transform(validation_artifact.validated_data_path)

        return artifact

    def run_model_training(self, transformation_artifact: DataTransformationArtifact) -> ModelTrainerArtifact:
        """Execute model training step"""
        logger.info("=" * 50)
        logger.info("[TrainingPipeline] STEP 4: Model Training")
        logger.info("=" * 50)

        trainer = ModelTrainer(self.config.model_trainer)
        artifact = trainer.train(transformation_artifact.feature_store_path)

        return artifact

    def run(self) -> PipelineArtifact:
        """
        Execute the complete training pipeline.

        Returns:
            PipelineArtifact with all step results
        """
        start_time = datetime.now()
        logger.info("=" * 60)
        logger.info("[TrainingPipeline] STARTING TRAINING PIPELINE")
        logger.info("=" * 60)

        try:
            # Step 1: Data Ingestion
            ingestion_artifact = self.run_data_ingestion()

            # Step 2: Data Validation
            validation_artifact = self.run_data_validation(ingestion_artifact)

            # Step 3: Data Transformation
            transformation_artifact = self.run_data_transformation(validation_artifact)

            # Step 4: Model Training
            training_artifact = self.run_model_training(transformation_artifact)

            pipeline_status = "SUCCESS"

        except Exception as e:
            logger.error(f"[TrainingPipeline] Pipeline failed: {e}")
            pipeline_status = f"FAILED: {str(e)}"
            raise

        finally:
            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()
            logger.info("=" * 60)
            logger.info(f"[TrainingPipeline] PIPELINE {pipeline_status}")
            logger.info(f"[TrainingPipeline] Duration: {duration:.1f}s")
            logger.info("=" * 60)

        # Build final artifact
        artifact = PipelineArtifact(
            data_ingestion=ingestion_artifact,
            data_validation=validation_artifact,
            data_transformation=transformation_artifact,
            model_trainer=training_artifact,
            pipeline_run_id=self.run_id,
            pipeline_start_time=start_time.isoformat(),
            pipeline_end_time=end_time.isoformat(),
            pipeline_status=pipeline_status
        )

        return artifact


def run_training_pipeline(config: Optional[PipelineConfig] = None) -> PipelineArtifact:
    """
    Convenience function to run the training pipeline.

    Args:
        config: Optional pipeline configuration

    Returns:
        PipelineArtifact with results
    """
    pipeline = TrainingPipeline(config)
    return pipeline.run()
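Outside Airflow, the same four steps can be run directly through the convenience function at the bottom of the file. A usage sketch, assuming models/anomaly-detection is the working directory so that the src package resolves:

import logging
from src.pipeline import run_training_pipeline
from src.entity import PipelineConfig

logging.basicConfig(level=logging.INFO)

# Uses the default ingestion/validation/transformation/trainer configs.
artifact = run_training_pipeline(PipelineConfig())
print(artifact.pipeline_status)                # "SUCCESS" if all four steps passed
print(artifact.model_trainer.best_model_name)  # e.g. "kmeans"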
models/anomaly-detection/src/utils/__init__.py
ADDED
@@ -0,0 +1,24 @@
"""
models/anomaly-detection/src/utils/__init__.py
"""
from .language_detector import LanguageDetector, detect_language, get_detector
from .vectorizer import MultilingualVectorizer, vectorize_text, get_vectorizer
from .metrics import (
    calculate_clustering_metrics,
    calculate_anomaly_metrics,
    calculate_optuna_objective,
    format_metrics_report
)

__all__ = [
    "LanguageDetector",
    "detect_language",
    "get_detector",
    "MultilingualVectorizer",
    "vectorize_text",
    "get_vectorizer",
    "calculate_clustering_metrics",
    "calculate_anomaly_metrics",
    "calculate_optuna_objective",
    "format_metrics_report"
]
models/anomaly-detection/src/utils/language_detector.py
ADDED
@@ -0,0 +1,209 @@
"""
models/anomaly-detection/src/utils/language_detector.py
Language detection using FastText or lingua-py for Sinhala/Tamil/English
"""
import os
import logging
from typing import Tuple, Optional
from pathlib import Path
import re

logger = logging.getLogger("language_detector")

# Try FastText first, fallback to lingua
try:
    import fasttext
    fasttext.FastText.eprint = lambda x: None  # Suppress warnings
    FASTTEXT_AVAILABLE = True
except ImportError:
    FASTTEXT_AVAILABLE = False
    logger.warning("FastText not available. Install with: pip install fasttext")

try:
    from lingua import Language, LanguageDetectorBuilder
    LINGUA_AVAILABLE = True
except ImportError:
    LINGUA_AVAILABLE = False
    logger.warning("Lingua not available. Install with: pip install lingua-language-detector")


class LanguageDetector:
    """
    Multilingual language detector supporting Sinhala, Tamil, and English.
    Uses FastText as primary detector with lingua fallback.
    """

    # Language code mapping
    LANG_MAP = {
        "en": "english",
        "si": "sinhala",
        "ta": "tamil",
        "__label__en": "english",
        "__label__si": "sinhala",
        "__label__ta": "tamil",
        "ENGLISH": "english",
        "SINHALA": "sinhala",
        "TAMIL": "tamil"
    }

    # Unicode ranges for script detection
    SINHALA_RANGE = (0x0D80, 0x0DFF)
    TAMIL_RANGE = (0x0B80, 0x0BFF)

    def __init__(self, models_cache_dir: Optional[str] = None):
        """
        Initialize language detector.

        Args:
            models_cache_dir: Directory for cached FastText models
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

        self.fasttext_model = None
        self.lingua_detector = None

        self._init_detectors()

    def _init_detectors(self):
        """Initialize detection models"""
        # Try FastText
        if FASTTEXT_AVAILABLE:
            model_path = Path(self.models_cache_dir) / "lid.176.bin"
            if model_path.exists():
                try:
                    self.fasttext_model = fasttext.load_model(str(model_path))
                    logger.info(f"[LanguageDetector] Loaded FastText model from {model_path}")
                except Exception as e:
                    logger.warning(f"[LanguageDetector] Failed to load FastText: {e}")
            else:
                logger.warning(f"[LanguageDetector] FastText model not found at {model_path}")
                logger.info("Download from: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin")

        # Initialize lingua as fallback
        if LINGUA_AVAILABLE:
            try:
                self.lingua_detector = LanguageDetectorBuilder.from_languages(
                    Language.ENGLISH,
                    Language.TAMIL,
                    # Note: Lingua may not have Sinhala, we'll use script detection
                ).build()
                logger.info("[LanguageDetector] Initialized Lingua detector")
            except Exception as e:
                logger.warning(f"[LanguageDetector] Failed to init Lingua: {e}")

    def _detect_by_script(self, text: str) -> Optional[str]:
        """
        Detect language by Unicode script analysis.
        More reliable for Sinhala/Tamil which have distinct scripts.
        """
        sinhala_count = 0
        tamil_count = 0
        latin_count = 0

        for char in text:
            code = ord(char)
            if self.SINHALA_RANGE[0] <= code <= self.SINHALA_RANGE[1]:
                sinhala_count += 1
            elif self.TAMIL_RANGE[0] <= code <= self.TAMIL_RANGE[1]:
                tamil_count += 1
            elif char.isalpha() and code < 128:
                latin_count += 1

        total_alpha = sinhala_count + tamil_count + latin_count
        if total_alpha == 0:
            return None

        # Threshold-based detection
        if sinhala_count / total_alpha > 0.3:
            return "sinhala"
        if tamil_count / total_alpha > 0.3:
            return "tamil"
        if latin_count / total_alpha > 0.5:
            return "english"

        return None

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect language of text.

        Args:
            text: Input text

        Returns:
            Tuple of (language_code, confidence)
            language_code: 'english', 'sinhala', 'tamil', or 'unknown'
        """
        if not text or len(text.strip()) < 3:
            return "unknown", 0.0

        # Clean text
        clean_text = re.sub(r'http\S+|@\w+|#\w+', '', text)
        clean_text = clean_text.strip()

        if not clean_text:
            return "unknown", 0.0

        # 1. First try script detection (most reliable for Sinhala/Tamil)
        script_lang = self._detect_by_script(clean_text)
        if script_lang in ["sinhala", "tamil"]:
            return script_lang, 0.95

        # 2. Try FastText
        if self.fasttext_model:
            try:
                predictions = self.fasttext_model.predict(clean_text.replace("\n", " "))
                label = predictions[0][0]
                confidence = predictions[1][0]

                lang = self.LANG_MAP.get(label, "unknown")
                if lang != "unknown" and confidence > 0.5:
                    return lang, float(confidence)
            except Exception as e:
                logger.debug(f"FastText error: {e}")

        # 3. Try Lingua
        if self.lingua_detector:
            try:
                detected = self.lingua_detector.detect_language_of(clean_text)
                if detected:
                    lang = self.LANG_MAP.get(detected.name, "unknown")
                    # Lingua doesn't return confidence, estimate based on text
                    confidence = 0.8 if len(clean_text) > 20 else 0.6
                    return lang, confidence
            except Exception as e:
                logger.debug(f"Lingua error: {e}")

        # 4. Fallback to script detection result or default
        if script_lang == "english":
            return "english", 0.7

        return "english", 0.5  # Default to English


# Singleton instance
_detector: Optional[LanguageDetector] = None


def get_detector(models_cache_dir: Optional[str] = None) -> LanguageDetector:
    """Get or create singleton detector instance"""
    global _detector
    if _detector is None:
        _detector = LanguageDetector(models_cache_dir)
    return _detector


def detect_language(text: str) -> Tuple[str, float]:
    """
    Convenience function for language detection.

    Args:
        text: Input text

    Returns:
        Tuple of (language: str, confidence: float)
    """
    return get_detector().detect(text)
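The convenience wrapper makes detection a one-liner. In the sketch below, the Sinhala and Tamil inputs are decided by the Unicode-script check alone, so it runs even without FastText or lingua installed; the English confidence then falls back to the 0.7 heuristic:

from src.utils.language_detector import detect_language

print(detect_language("The rupee weakened against the dollar today."))
# ("english", ...) - exact confidence depends on which backend is available
print(detect_language("මෙය සිංහල වාක්‍යයකි"))   # ("sinhala", 0.95) via Unicode script range
print(detect_language("இது ஒரு தமிழ் வாக்கியம்"))  # ("tamil", 0.95) via Unicode script range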
models/anomaly-detection/src/utils/metrics.py
ADDED
@@ -0,0 +1,256 @@
"""
models/anomaly-detection/src/utils/metrics.py
Clustering and anomaly detection metrics for model evaluation
"""
import numpy as np
from typing import Dict, Any, Optional, List
import logging

logger = logging.getLogger("metrics")

# Scikit-learn metrics
try:
    from sklearn.metrics import (
        silhouette_score,
        calinski_harabasz_score,
        davies_bouldin_score,
        adjusted_rand_score,
        normalized_mutual_info_score
    )
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False
    logger.warning("scikit-learn not available for metrics")


def calculate_clustering_metrics(
    X: np.ndarray,
    labels: np.ndarray,
    true_labels: Optional[np.ndarray] = None
) -> Dict[str, float]:
    """
    Calculate comprehensive clustering quality metrics.

    Args:
        X: Feature matrix (n_samples, n_features)
        labels: Predicted cluster labels
        true_labels: Optional ground truth labels for supervised metrics

    Returns:
        Dict of metric_name -> metric_value
    """
    if not SKLEARN_AVAILABLE:
        logger.warning("sklearn not available, returning empty metrics")
        return {}

    metrics = {}

    # Filter out noise points (label=-1) for some metrics
    valid_mask = labels >= 0
    n_clusters = len(set(labels[valid_mask]))

    # Need at least 2 clusters and >1 samples for metrics
    if n_clusters < 2 or np.sum(valid_mask) < 2:
        metrics["n_clusters"] = n_clusters
        metrics["n_noise_points"] = np.sum(labels == -1)
        metrics["error"] = "insufficient_clusters"
        return metrics

    # Internal metrics (don't need ground truth)
    try:
        # Silhouette Score: -1 (bad) to 1 (good)
        # Measures how similar objects are to their own cluster vs other clusters
        metrics["silhouette_score"] = float(silhouette_score(
            X[valid_mask], labels[valid_mask]
        ))
    except Exception as e:
        logger.debug(f"Silhouette score failed: {e}")
        metrics["silhouette_score"] = None

    try:
        # Calinski-Harabasz Index: Higher is better
        # Ratio of between-cluster dispersion to within-cluster dispersion
        metrics["calinski_harabasz_score"] = float(calinski_harabasz_score(
            X[valid_mask], labels[valid_mask]
        ))
    except Exception as e:
        logger.debug(f"Calinski-Harabasz failed: {e}")
        metrics["calinski_harabasz_score"] = None

    try:
        # Davies-Bouldin Index: Lower is better
        # Average similarity between clusters
        metrics["davies_bouldin_score"] = float(davies_bouldin_score(
            X[valid_mask], labels[valid_mask]
        ))
    except Exception as e:
        logger.debug(f"Davies-Bouldin failed: {e}")
        metrics["davies_bouldin_score"] = None

    # Cluster statistics
    metrics["n_clusters"] = n_clusters
    metrics["n_samples"] = len(labels)
    metrics["n_noise_points"] = int(np.sum(labels == -1))
    metrics["noise_ratio"] = float(np.sum(labels == -1) / len(labels))

    # Cluster size statistics
    cluster_sizes = [np.sum(labels == i) for i in range(n_clusters)]
    metrics["min_cluster_size"] = int(min(cluster_sizes)) if cluster_sizes else 0
    metrics["max_cluster_size"] = int(max(cluster_sizes)) if cluster_sizes else 0
    metrics["mean_cluster_size"] = float(np.mean(cluster_sizes)) if cluster_sizes else 0

    # External metrics (if ground truth provided)
    if true_labels is not None:
        try:
            # Adjusted Rand Index: -1 to 1, 1=perfect, 0=random
            metrics["adjusted_rand_score"] = float(adjusted_rand_score(
                true_labels, labels
            ))
        except Exception as e:
            logger.debug(f"ARI failed: {e}")

        try:
            # Normalized Mutual Information: 0 to 1, 1=perfect agreement
            metrics["normalized_mutual_info"] = float(normalized_mutual_info_score(
                true_labels, labels
            ))
        except Exception as e:
            logger.debug(f"NMI failed: {e}")

    return metrics


def calculate_anomaly_metrics(
    labels: np.ndarray,
    predicted_anomalies: np.ndarray,
    true_anomalies: Optional[np.ndarray] = None
) -> Dict[str, float]:
    """
    Calculate anomaly detection metrics.

    Args:
        labels: Cluster labels or -1 for anomalies
        predicted_anomalies: Boolean array of predicted anomaly flags
        true_anomalies: Optional ground truth anomaly flags

    Returns:
        Dict of metric_name -> metric_value
    """
    metrics = {}

    n_samples = len(labels)
    n_predicted_anomalies = int(np.sum(predicted_anomalies))

    metrics["n_samples"] = n_samples
    metrics["n_predicted_anomalies"] = n_predicted_anomalies
    metrics["anomaly_rate"] = float(n_predicted_anomalies / n_samples) if n_samples > 0 else 0

    # If ground truth available, calculate precision/recall
    if true_anomalies is not None:
        n_true_anomalies = int(np.sum(true_anomalies))

        # True positives: predicted AND actual anomalies
        tp = int(np.sum(predicted_anomalies & true_anomalies))
        # False positives: predicted anomaly but not actual
        fp = int(np.sum(predicted_anomalies & ~true_anomalies))
        # False negatives: not predicted but actual anomaly
        fn = int(np.sum(~predicted_anomalies & true_anomalies))
        # True negatives
        tn = int(np.sum(~predicted_anomalies & ~true_anomalies))

        metrics["true_positives"] = tp
        metrics["false_positives"] = fp
        metrics["false_negatives"] = fn
        metrics["true_negatives"] = tn

        # Precision: TP / (TP + FP)
        metrics["precision"] = float(tp / (tp + fp)) if (tp + fp) > 0 else 0

        # Recall: TP / (TP + FN)
        metrics["recall"] = float(tp / (tp + fn)) if (tp + fn) > 0 else 0

        # F1 Score
        if metrics["precision"] + metrics["recall"] > 0:
            metrics["f1_score"] = float(
                2 * metrics["precision"] * metrics["recall"] /
                (metrics["precision"] + metrics["recall"])
            )
        else:
            metrics["f1_score"] = 0

    return metrics


def calculate_optuna_objective(
    X: np.ndarray,
    labels: np.ndarray,
    objective_type: str = "silhouette"
) -> float:
    """
    Calculate objective value for Optuna optimization.

    Args:
        X: Feature matrix
        labels: Predicted labels
        objective_type: 'silhouette', 'calinski', or 'combined'

    Returns:
        Objective value (higher is better)
    """
    metrics = calculate_clustering_metrics(X, labels)

    # Check for errors
    if "error" in metrics:
        return -1.0  # Return bad score for failed clustering

    if objective_type == "silhouette":
        score = metrics.get("silhouette_score")
        return score if score is not None else -1.0

    elif objective_type == "calinski":
        score = metrics.get("calinski_harabasz_score")
        # Normalize to 0-1 range (approximate)
        return min(score / 1000, 1.0) if score is not None else -1.0

    elif objective_type == "combined":
        # Weighted combination of metrics
        silhouette = metrics.get("silhouette_score", -1)
        calinski = min(metrics.get("calinski_harabasz_score", 0) / 1000, 1)
        davies = metrics.get("davies_bouldin_score", 10)

        # Davies-Bouldin is lower=better, invert it
        davies_inv = 1 / (1 + davies) if davies is not None else 0

        # Weighted combination
        combined = (0.4 * silhouette + 0.3 * calinski + 0.3 * davies_inv)
        return float(combined)

    return -1.0


def format_metrics_report(metrics: Dict[str, Any]) -> str:
    """
    Format metrics dictionary as a readable report.

    Args:
        metrics: Dictionary of metric values

    Returns:
        Formatted string report
    """
    lines = ["=" * 50]
    lines.append("CLUSTERING METRICS REPORT")
    lines.append("=" * 50)

    for key, value in metrics.items():
        if value is None:
            value_str = "N/A"
        elif isinstance(value, float):
            value_str = f"{value:.4f}"
        else:
            value_str = str(value)

        lines.append(f"{key:30s}: {value_str}")

    lines.append("=" * 50)
    return "\n".join(lines)
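A quick way to sanity-check these metrics is to cluster two well-separated synthetic blobs, where the silhouette score should come out close to 1. A sketch, with scikit-learn assumed installed as the module itself expects:

import numpy as np
from sklearn.cluster import KMeans
from src.utils.metrics import calculate_clustering_metrics, format_metrics_report

rng = np.random.default_rng(42)
# Two well-separated Gaussian blobs in 8 dimensions.
X = np.vstack([rng.normal(0, 0.2, (100, 8)), rng.normal(5, 0.2, (100, 8))])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

metrics = calculate_clustering_metrics(X, labels)
print(format_metrics_report(metrics))  # silhouette_score, calinski_harabasz_score, ...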
models/anomaly-detection/src/utils/vectorizer.py
ADDED
@@ -0,0 +1,243 @@
"""
models/anomaly-detection/src/utils/vectorizer.py
Text vectorization using language-specific BERT models (downloaded locally)
"""
import os
import logging
from typing import Dict, List, Optional, Tuple
from pathlib import Path
import numpy as np

logger = logging.getLogger("vectorizer")

# Transformers
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    logger.warning("Transformers not available. Install with: pip install transformers torch")

# Sentence Transformers for fallback
try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False


class MultilingualVectorizer:
    """
    Vectorizer using language-specific BERT models.
    Downloads and caches models locally from HuggingFace.

    Models:
    - English: distilbert-base-uncased (fast, accurate)
    - Sinhala: keshan/SinhalaBERTo (specialized)
    - Tamil: l3cube-pune/tamil-bert (specialized)
    """

    MODEL_MAP = {
        "english": "distilbert-base-uncased",
        "sinhala": "keshan/SinhalaBERTo",
        "tamil": "l3cube-pune/tamil-bert"
    }

    def __init__(self, models_cache_dir: Optional[str] = None, device: Optional[str] = None):
        """
        Initialize the multilingual vectorizer.

        Args:
            models_cache_dir: Directory to cache downloaded models
            device: 'cuda' or 'cpu' (auto-detected if None)
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

        # Set cache dir for HuggingFace
        os.environ["TRANSFORMERS_CACHE"] = self.models_cache_dir
        os.environ["HF_HOME"] = self.models_cache_dir

        # Auto-detect device
        if device is None:
            if TRANSFORMERS_AVAILABLE and torch.cuda.is_available():
                self.device = "cuda"
            else:
                self.device = "cpu"
        else:
            self.device = device

        logger.info(f"[Vectorizer] Using device: {self.device}")

        # Lazy load models
        self.models: Dict[str, Tuple] = {}  # {lang: (tokenizer, model)}
        self.fallback_model = None

    def _load_model(self, language: str) -> Tuple:
        """
        Load language-specific model from cache or download.

        Returns:
            Tuple of (tokenizer, model)
        """
        if language in self.models:
            return self.models[language]

        model_name = self.MODEL_MAP.get(language, self.MODEL_MAP["english"])

        if not TRANSFORMERS_AVAILABLE:
            raise RuntimeError("Transformers library not available")

        logger.info(f"[Vectorizer] Loading model: {model_name}")

        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                cache_dir=self.models_cache_dir
            )
            model = AutoModel.from_pretrained(
                model_name,
                cache_dir=self.models_cache_dir
            ).to(self.device)
            model.eval()

            self.models[language] = (tokenizer, model)
            logger.info(f"[Vectorizer] ✓ Loaded {model_name} ({language})")
            return tokenizer, model

        except Exception as e:
            logger.error(f"[Vectorizer] Failed to load {model_name}: {e}")
            # Fallback to English model
            if language != "english":
                logger.info("[Vectorizer] Falling back to English model")
                return self._load_model("english")
            raise

    def _get_embedding(self, text: str, tokenizer, model) -> np.ndarray:
        """
        Get embedding vector using mean pooling.

        Args:
            text: Input text
            tokenizer: HuggingFace tokenizer
            model: HuggingFace model

        Returns:
            768-dim numpy array
        """
        if not TRANSFORMERS_AVAILABLE:
            raise RuntimeError("Transformers not available")

        # Tokenize
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(self.device)

        # Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)

        # Mean pooling over sequence length
        attention_mask = inputs["attention_mask"]
        hidden_states = outputs.last_hidden_state

        # Mask and average
        mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_embedding = sum_embeddings / sum_mask

        return mean_embedding.cpu().numpy().flatten()

    def vectorize(self, text: str, language: str = "english") -> np.ndarray:
        """
        Convert text to vector embedding.

        Args:
            text: Input text
            language: 'english', 'sinhala', 'tamil', or 'unknown'

        Returns:
            768-dim numpy array
        """
        if not text or not text.strip():
            return np.zeros(768)

        # Map unknown to english
        if language == "unknown":
            language = "english"

        try:
            tokenizer, model = self._load_model(language)
            return self._get_embedding(text, tokenizer, model)
        except Exception as e:
            logger.error(f"[Vectorizer] Error vectorizing: {e}")
            # Return zeros as fallback
            return np.zeros(768)

    def vectorize_batch(
        self,
        texts: List[str],
        languages: Optional[List[str]] = None
    ) -> np.ndarray:
        """
        Batch vectorization for multiple texts.

        Args:
            texts: List of text strings
            languages: Optional list of language codes (same length as texts)

        Returns:
            numpy array of shape (n_texts, 768)
        """
        if languages is None:
            languages = ["english"] * len(texts)

        embeddings = []
        for text, lang in zip(texts, languages):
            emb = self.vectorize(text, lang)
            embeddings.append(emb)

        return np.array(embeddings)

    def download_all_models(self):
        """Pre-download all language models"""
        for language in self.MODEL_MAP.keys():
            try:
                logger.info(f"[Vectorizer] Pre-downloading {language} model...")
                self._load_model(language)
            except Exception as e:
                logger.warning(f"[Vectorizer] Failed to download {language}: {e}")


# Singleton instance
_vectorizer: Optional[MultilingualVectorizer] = None


def get_vectorizer(models_cache_dir: Optional[str] = None) -> MultilingualVectorizer:
    """Get or create singleton vectorizer instance"""
    global _vectorizer
    if _vectorizer is None:
        _vectorizer = MultilingualVectorizer(models_cache_dir)
    return _vectorizer


def vectorize_text(text: str, language: str = "english") -> np.ndarray:
    """
    Convenience function for text vectorization.

    Args:
        text: Input text
        language: Language code

    Returns:
        768-dim numpy array
    """
    return get_vectorizer().vectorize(text, language)
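Combined with the language detector, the intended call pattern looks roughly like the sketch below. First use downloads the HuggingFace models into models_cache/, so transformers and torch need to be installed; on any failure the class falls back to zero vectors:

from src.utils.vectorizer import get_vectorizer
from src.utils.language_detector import detect_language

texts = ["Protest reported near the main square", "මෙය සිංහල වාක්‍යයකි"]
languages = [detect_language(t)[0] for t in texts]  # e.g. ["english", "sinhala"]

vectorizer = get_vectorizer()
embeddings = vectorizer.vectorize_batch(texts, languages)
print(embeddings.shape)  # (2, 768) per the module's 768-dim convention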
models/anomaly-detection/tests/dags/test_dag_example.py
ADDED
@@ -0,0 +1,83 @@
"""Example DAGs test. This test ensures that all Dags have tags, retries set to two, and no import errors. This is an example pytest and may not fit the context of your DAGs. Feel free to add and remove tests."""

import os
import logging
from contextlib import contextmanager
import pytest
from airflow.models import DagBag


@contextmanager
def suppress_logging(namespace):
    logger = logging.getLogger(namespace)
    old_value = logger.disabled
    logger.disabled = True
    try:
        yield
    finally:
        logger.disabled = old_value


def get_import_errors():
    """
    Generate a tuple for import errors in the dag bag
    """
    with suppress_logging("airflow"):
        dag_bag = DagBag(include_examples=False)

        def strip_path_prefix(path):
            return os.path.relpath(path, os.environ.get("AIRFLOW_HOME"))

        # prepend "(None,None)" to ensure that a test object is always created even if it's a no op.
        return [(None, None)] + [
            (strip_path_prefix(k), v.strip()) for k, v in dag_bag.import_errors.items()
        ]


def get_dags():
    """
    Generate a tuple of dag_id, <DAG objects> in the DagBag
    """
    with suppress_logging("airflow"):
        dag_bag = DagBag(include_examples=False)

    def strip_path_prefix(path):
        return os.path.relpath(path, os.environ.get("AIRFLOW_HOME"))

    return [(k, v, strip_path_prefix(v.fileloc)) for k, v in dag_bag.dags.items()]


@pytest.mark.parametrize(
    "rel_path,rv", get_import_errors(), ids=[x[0] for x in get_import_errors()]
)
def test_file_imports(rel_path, rv):
    """Test for import errors on a file"""
    if rel_path and rv:
        raise Exception(f"{rel_path} failed to import with message \n {rv}")


APPROVED_TAGS = {}


@pytest.mark.parametrize(
    "dag_id,dag,fileloc", get_dags(), ids=[x[2] for x in get_dags()]
)
def test_dag_tags(dag_id, dag, fileloc):
    """
    test if a DAG is tagged and if those TAGs are in the approved list
    """
    assert dag.tags, f"{dag_id} in {fileloc} has no tags"
    if APPROVED_TAGS:
        assert not set(dag.tags) - APPROVED_TAGS


@pytest.mark.parametrize(
    "dag_id,dag, fileloc", get_dags(), ids=[x[2] for x in get_dags()]
)
def test_dag_retries(dag_id, dag, fileloc):
    """
    test if a DAG has retries set
    """
    assert (
        dag.default_args.get("retries", None) >= 2
    ), f"{dag_id} in {fileloc} must have task retries >= 2."
ADDED
|
File without changes
|
models/currency-volatility-prediction/.gitignore
ADDED
@@ -0,0 +1,13 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv


.env
models/currency-volatility-prediction/.python-version
ADDED
@@ -0,0 +1 @@
3.11
models/currency-volatility-prediction/Dockerfile
ADDED
@@ -0,0 +1,8 @@
FROM python:3.10-slim-bookworm
WORKDIR /app
COPY . /app

RUN apt update -y && apt install awscli -y

RUN apt-get update && pip install -r requirements.txt
CMD ["python3", "app.py"]
models/currency-volatility-prediction/README.md
ADDED
File without changes
models/currency-volatility-prediction/app.py
ADDED
File without changes
models/currency-volatility-prediction/dags/currency_prediction_dag.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
models/currency-volatility-prediction/dags/currency_prediction_dag.py
Airflow DAG for daily USD/LKR currency prediction
Runs at 4:00 AM IST daily
"""
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path

# Add paths for imports
PIPELINE_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PIPELINE_ROOT / "src"))

from airflow import DAG
from airflow.operators.python import PythonOperator

# Load environment variables from root .env
try:
    from dotenv import load_dotenv
    # Path: dags/ -> currency-volatility-prediction/ -> models/ -> root/
    env_path = Path(__file__).parent.parent.parent.parent / ".env"
    if env_path.exists():
        load_dotenv(env_path)
        print(f"[MLflow] ✓ Loaded env from {env_path}")
except ImportError:
    pass


# Default arguments
default_args = {
    "owner": "modelx",
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 2,
    "retry_delay": timedelta(minutes=5),
}


def ingest_data(**context):
    """Task: Ingest currency data from yfinance."""
    from components.data_ingestion import CurrencyDataIngestion
    from entity.config_entity import DataIngestionConfig

    print("[CURRENCY DAG] Starting data ingestion...")

    config = DataIngestionConfig(history_period="2y")
    ingestion = CurrencyDataIngestion(config)

    # Check if we already have recent data
    try:
        df = ingestion.load_existing()
        latest_date = df["date"].max()
        if isinstance(latest_date, str):
            latest_date = datetime.strptime(latest_date, "%Y-%m-%d")

        days_old = (datetime.now() - latest_date).days

        if days_old < 1:
            print(f"[CURRENCY DAG] Data is current ({days_old} days old)")
            context["ti"].xcom_push(key="data_path", value=str(ingestion.config.raw_data_dir))
            return str(ingestion.config.raw_data_dir)
    except FileNotFoundError:
        pass

    # Full ingestion
    data_path = ingestion.ingest_all()
    context["ti"].xcom_push(key="data_path", value=data_path)

    print(f"[CURRENCY DAG] ✓ Data saved to {data_path}")
    return data_path


def train_model(**context):
    """Task: Train GRU model."""
    from components.model_trainer import CurrencyGRUTrainer
    from components.data_ingestion import CurrencyDataIngestion
    from entity.config_entity import ModelTrainerConfig

    print("[CURRENCY DAG] Starting model training...")

    # Load data
    ingestion = CurrencyDataIngestion()
    df = ingestion.load_existing()

    print(f"[CURRENCY DAG] Loaded {len(df)} records")

    # Train
    config = ModelTrainerConfig(
        epochs=100,
        batch_size=16,
        early_stopping_patience=15
    )
    trainer = CurrencyGRUTrainer(config)

    results = trainer.train(df=df, use_mlflow=True)

    print("[CURRENCY DAG] ✓ Training complete!")
    print(f"  MAE: {results['test_mae']:.4f} LKR")
    print(f"  Direction Accuracy: {results['direction_accuracy']*100:.1f}%")

    context["ti"].xcom_push(key="model_path", value=results["model_path"])
    return results


def generate_prediction(**context):
    """Task: Generate next-day prediction."""
    from components.predictor import CurrencyPredictor
    from components.data_ingestion import CurrencyDataIngestion

    print("[CURRENCY DAG] Generating prediction...")

    predictor = CurrencyPredictor()

    try:
        # Load latest data
        ingestion = CurrencyDataIngestion()
        df = ingestion.load_existing()

        # Generate prediction
        prediction = predictor.predict(df)

    except FileNotFoundError:
        # Model not trained yet, use fallback
        print("[CURRENCY DAG] Model not trained, using fallback")
        prediction = predictor.generate_fallback_prediction()
    except Exception as e:
        print(f"[CURRENCY DAG] Error predicting: {e}")
        prediction = predictor.generate_fallback_prediction()

    # Save prediction
    output_path = predictor.save_prediction(prediction)

    print("[CURRENCY DAG] ✓ Prediction generated!")
    print(f"  Current: {prediction['current_rate']} LKR/USD")
    print(f"  Predicted: {prediction['predicted_rate']} LKR/USD")
    print(f"  Change: {prediction['expected_change_pct']:+.2f}%")
    print(f"  Direction: {prediction['direction']}")

    context["ti"].xcom_push(key="prediction_path", value=output_path)
    return prediction


def publish_prediction(**context):
    """Task: Log prediction summary."""
    prediction = context["ti"].xcom_pull(task_ids="generate_prediction")

    if prediction:
        print("\n" + "=" * 50)
        print("USD/LKR CURRENCY PREDICTION")
        print("=" * 50)
        print(f"Prediction for: {prediction.get('prediction_date')}")
        print(f"Current Rate: {prediction.get('current_rate')} LKR/USD")
        print(f"Predicted Rate: {prediction.get('predicted_rate')} LKR/USD")
        print(f"Expected Change: {prediction.get('expected_change_pct'):+.3f}%")
        print(f"Direction: {prediction.get('direction_emoji')} {prediction.get('direction')}")
        print(f"Volatility: {prediction.get('volatility_class')}")
        if prediction.get('is_fallback'):
            print("⚠️ Using fallback model")
        print("=" * 50 + "\n")

    return True


# Define DAG
with DAG(
    dag_id="currency_prediction_daily",
    default_args=default_args,
    description="Daily USD/LKR currency prediction using GRU neural network",
    schedule_interval="0 4 * * *",  # 04:00 in the DAG's time zone (UTC by default, i.e. 09:30 IST; see the note after this file)
    start_date=datetime(2024, 12, 1),
    catchup=False,
    tags=["currency", "ml", "prediction", "gru", "forex"],
) as dag:

    # Task 1: Ingest Data
    # Airflow 2.x passes the task context to the callable automatically,
    # so the deprecated provide_context flag is not needed on these operators.
    task_ingest = PythonOperator(
        task_id="ingest_data",
        python_callable=ingest_data,
    )

    # Task 2: Train Model
    task_train = PythonOperator(
        task_id="train_model",
        python_callable=train_model,
    )

    # Task 3: Generate Prediction
    task_predict = PythonOperator(
        task_id="generate_prediction",
        python_callable=generate_prediction,
    )

    # Task 4: Publish Prediction
    task_publish = PythonOperator(
        task_id="publish_prediction",
        python_callable=publish_prediction,
    )

    # Dependencies
    task_ingest >> task_train >> task_predict >> task_publish


if __name__ == "__main__":
    print("Currency Prediction DAG loaded successfully")
    print("Schedule: Daily at 4:00 AM")
    print(f"Tasks: {[t.task_id for t in dag.tasks]}")
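Note on the schedule: Airflow evaluates a cron expression in the DAG's time zone, which defaults to UTC, so "0 4 * * *" above fires at 04:00 UTC (09:30 IST). Below is a minimal sketch of pinning the DAG to a UTC+5:30 zone with pendulum so the same cron means 4:00 AM local time; the zone name "Asia/Colombo" and the alternative dag_id are assumptions for illustration, not something this repository defines.

import pendulum
from airflow import DAG

# Assumed UTC+5:30 zone; any zone with that offset behaves the same for this cron.
local_tz = pendulum.timezone("Asia/Colombo")

with DAG(
    dag_id="currency_prediction_daily_ist",  # hypothetical id used only for this sketch
    schedule_interval="0 4 * * *",           # interpreted in local_tz via the start_date below
    start_date=pendulum.datetime(2024, 12, 1, tz=local_tz),
    catchup=False,
) as tz_dag:
    pass  # the four PythonOperator tasks above would be attached here unchanged
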
models/currency-volatility-prediction/data_schema/schema.yaml
ADDED
File without changes

models/currency-volatility-prediction/main.py
ADDED
@@ -0,0 +1,179 @@
"""
models/currency-volatility-prediction/main.py
Entry point for Currency Prediction Pipeline
Can run data collection, training, or prediction independently
"""
import os
import sys
import logging
import argparse
from pathlib import Path
from datetime import datetime

# Setup paths
PIPELINE_ROOT = Path(__file__).parent
sys.path.insert(0, str(PIPELINE_ROOT / "src"))

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("currency_prediction")


def run_data_ingestion(period: str = "2y"):
    """Run data ingestion from yfinance."""
    from components.data_ingestion import CurrencyDataIngestion
    from entity.config_entity import DataIngestionConfig

    logger.info(f"Starting data ingestion ({period})...")

    config = DataIngestionConfig(history_period=period)
    ingestion = CurrencyDataIngestion(config)

    data_path = ingestion.ingest_all()

    df = ingestion.load_existing(data_path)

    logger.info("Data Ingestion Complete!")
    logger.info(f"Total records: {len(df)}")
    logger.info(f"Features: {len(df.columns)}")
    logger.info(f"Date range: {df['date'].min()} to {df['date'].max()}")
    logger.info(f"Latest rate: {df['close'].iloc[-1]:.2f} LKR/USD")

    return data_path


def run_training(epochs: int = 100):
    """Run GRU model training."""
    from components.data_ingestion import CurrencyDataIngestion
    from components.model_trainer import CurrencyGRUTrainer
    from entity.config_entity import ModelTrainerConfig

    logger.info("Starting model training...")

    # Load data
    ingestion = CurrencyDataIngestion()
    df = ingestion.load_existing()

    logger.info(f"Loaded {len(df)} records with {len(df.columns)} features")

    # Train
    config = ModelTrainerConfig(epochs=epochs)
    trainer = CurrencyGRUTrainer(config)

    results = trainer.train(df=df, use_mlflow=True)

    logger.info("\nTraining Results:")
    logger.info(f"  MAE: {results['test_mae']:.4f} LKR")
    logger.info(f"  RMSE: {results['rmse']:.4f} LKR")
    logger.info(f"  Direction Accuracy: {results['direction_accuracy']*100:.1f}%")
    logger.info(f"  Epochs: {results['epochs_trained']}")
    logger.info(f"  Model saved: {results['model_path']}")

    return results


def run_prediction():
    """Run prediction for next day."""
    from components.data_ingestion import CurrencyDataIngestion
    from components.predictor import CurrencyPredictor

    logger.info("Generating prediction...")

    predictor = CurrencyPredictor()

    try:
        ingestion = CurrencyDataIngestion()
        df = ingestion.load_existing()
        prediction = predictor.predict(df)
    except FileNotFoundError:
        logger.warning("Model not trained, using fallback")
        prediction = predictor.generate_fallback_prediction()
    except Exception as e:
        logger.error(f"Error: {e}")
        prediction = predictor.generate_fallback_prediction()

    output_path = predictor.save_prediction(prediction)

    # Display
    logger.info(f"\n{'='*50}")
    logger.info(f"USD/LKR PREDICTION FOR {prediction['prediction_date']}")
    logger.info(f"{'='*50}")
    logger.info(f"Current Rate: {prediction['current_rate']:.2f} LKR/USD")
    logger.info(f"Predicted Rate: {prediction['predicted_rate']:.2f} LKR/USD")
    logger.info(f"Expected Change: {prediction['expected_change_pct']:+.3f}%")
    logger.info(f"Direction: {prediction['direction_emoji']} LKR {prediction['direction']}")
    logger.info(f"Volatility: {prediction['volatility_class']}")

    if prediction.get('weekly_trend'):
        logger.info(f"Weekly Trend: {prediction['weekly_trend']:+.2f}%")
    if prediction.get('monthly_trend'):
        logger.info(f"Monthly Trend: {prediction['monthly_trend']:+.2f}%")

    logger.info(f"{'='*50}")
    logger.info(f"Saved to: {output_path}")

    return prediction


def run_full_pipeline():
    """Run the complete pipeline: ingest → train → predict."""
    logger.info("=" * 60)
    logger.info("CURRENCY PREDICTION PIPELINE - FULL RUN")
    logger.info("=" * 60)

    # Step 1: Data Ingestion
    try:
        run_data_ingestion(period="2y")
    except Exception as e:
        logger.error(f"Data ingestion failed: {e}")
        return None

    # Step 2: Training
    try:
        run_training(epochs=100)
    except Exception as e:
        logger.error(f"Training failed: {e}")

    # Step 3: Prediction
    prediction = run_prediction()

    logger.info("=" * 60)
    logger.info("PIPELINE COMPLETE!")
    logger.info("=" * 60)

    return prediction


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Currency Prediction Pipeline")
    parser.add_argument(
        "--mode",
        choices=["ingest", "train", "predict", "full"],
        default="predict",
        help="Pipeline mode to run"
    )
    parser.add_argument(
        "--period",
        type=str,
        default="2y",
        help="Data period (1y, 2y, 5y)"
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=100,
        help="Training epochs"
    )

    args = parser.parse_args()

    if args.mode == "ingest":
        run_data_ingestion(period=args.period)
    elif args.mode == "train":
        run_training(epochs=args.epochs)
    elif args.mode == "predict":
        run_prediction()
    elif args.mode == "full":
        run_full_pipeline()
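Both the DAG and this entry point report a direction_accuracy value from the trainer, but the trainer itself is not part of this hunk. As a point of reference, here is a minimal sketch of how such a metric is commonly computed, assuming it is the fraction of days on which the predicted day-over-day move has the same sign as the actual move; the real implementation in components/model_trainer.py may differ.

import numpy as np

def direction_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Share of steps where predicted and actual day-over-day changes agree in sign."""
    # Assumed metric definition, not taken from the repository's trainer.
    true_change = np.diff(y_true)
    pred_change = np.diff(y_pred)
    return float(np.mean(np.sign(true_change) == np.sign(pred_change)))

# Worked example with hypothetical LKR/USD closing rates
actual = np.array([298.0, 298.5, 298.2, 299.0])
predicted = np.array([298.1, 298.4, 298.6, 299.2])
print(direction_accuracy(actual, predicted))  # 0.667 -> 2 of 3 moves predicted in the right direction
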
models/currency-volatility-prediction/pyproject.toml
ADDED
@@ -0,0 +1,20 @@
[project]
name = "modelx-final-models"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "certifi>=2025.11.12",
    "dagshub>=0.6.3",
    "fastapi>=0.122.0",
    "mlflow>=3.6.0",
    "numpy>=2.3.5",
    "pandas>=2.3.3",
    "pyaml>=25.7.0",
    "pymongo[srv]>=4.15.4",
    "python-dotenv>=1.2.1",
    "python-multipart>=0.0.20",
    "scikit-learn>=1.7.2",
    "uvicorn>=0.38.0",
]
models/currency-volatility-prediction/requirements.txt
ADDED
@@ -0,0 +1,15 @@
python-dotenv
pandas
numpy
pymongo
certifi
pymongo[srv]
scikit-learn
mlflow
pyaml
dagshub
fastapi
uvicorn
python-multipart

#-e .
models/currency-volatility-prediction/setup.py
ADDED
@@ -0,0 +1,47 @@
'''
The setup.py file is an essential part of packaging and
distributing Python projects. It is used by setuptools
(or distutils in older Python versions) to define the configuration
of your project, such as its metadata, dependencies, and more.
'''

from setuptools import find_packages, setup
# find_packages scans the project and collects every folder that contains an __init__.py file
# setup is responsible for providing all the information about the project

from typing import List

def get_requirements() -> List[str]:
    """
    Return the list of requirements read from requirements.txt.
    """
    requirement_lst: List[str] = []
    try:
        with open('requirements.txt', 'r') as file:
            # Read lines from the file
            lines = file.readlines()
            # Process each line
            for line in lines:
                requirement = line.strip()
                # Ignore empty lines, comments, and the editable-install marker -e .
                if requirement and requirement != '-e .' and not requirement.startswith('#'):
                    requirement_lst.append(requirement)
    except FileNotFoundError:
        print("requirements.txt file not found")

    return requirement_lst

setup(
    name="NetworkSecurity",
    version="0.0.1",  # This can be changed later
    author="Nivakaran S.",
    author_email="nivakaran@hotmail.com",
    packages=find_packages(),
    install_requires=get_requirements()
)
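As a quick check of get_requirements() against the requirements.txt shown above, the sketch below applies the same filtering rules to that file's lines; the expected result is the thirteen package names, with the blank line and the commented-out "#-e ." entry dropped.

# Mirrors the filtering in get_requirements(), applied to the requirements.txt above.
lines = [
    "python-dotenv\n", "pandas\n", "numpy\n", "pymongo\n", "certifi\n",
    "pymongo[srv]\n", "scikit-learn\n", "mlflow\n", "pyaml\n", "dagshub\n",
    "fastapi\n", "uvicorn\n", "python-multipart\n", "\n", "#-e .\n",
]
requirements = [
    line.strip() for line in lines
    if line.strip() and line.strip() != "-e ." and not line.strip().startswith("#")
]
print(requirements)
# ['python-dotenv', 'pandas', 'numpy', 'pymongo', 'certifi', 'pymongo[srv]',
#  'scikit-learn', 'mlflow', 'pyaml', 'dagshub', 'fastapi', 'uvicorn', 'python-multipart']
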
models/currency-volatility-prediction/src/__init__.py
ADDED
@@ -0,0 +1,21 @@
import logging
import os
from datetime import datetime

LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

# Create the logs directory only if it does not exist yet
# (joining LOG_FILE into the directory path would create a folder named after
# the log file and nest the log inside it, so the directory and file paths are kept separate)
logs_dir = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_dir, exist_ok=True)

LOG_FILE_PATH = os.path.join(logs_dir, LOG_FILE)

logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO
)
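Importing this package configures the root logger to write into a timestamped file under logs/. A minimal usage sketch follows, assuming the pipeline root is on sys.path so the package can be imported as src; note that main.py and the DAG insert src/ itself onto sys.path, so this import path is an assumption for illustration only.

import logging

import src  # noqa: F401  the import is what triggers the logging configuration above

logger = logging.getLogger("currency_prediction")
logger.info("This record ends up in logs/<MM_DD_YYYY_HH_MM_SS>.log")
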
models/currency-volatility-prediction/src/components/__init__.py
ADDED
File without changes