diff --git a/agent-app/app/agent.py b/agent-app/app/agent.py index 3592f28..7818752 100644 --- a/agent-app/app/agent.py +++ b/agent-app/app/agent.py @@ -28,6 +28,7 @@ mosaic_rag_agent_presales, ) from .sub_agents.compliance_and_security_baseline_agent import compliance_agent +from .sub_agents.data_model_discovery_agent import data_model_discovery_agent from .sub_agents.detailed_architecture_design_agent import ( detailed_architecture_design_agent, ) @@ -66,5 +67,6 @@ capability_mapper_agent, strategy_recommender_agent, detailed_architecture_design_agent, + data_model_discovery_agent, ], ) diff --git a/agent-app/app/prompt.py b/agent-app/app/prompt.py index 1ed4fdd..4f19adb 100644 --- a/agent-app/app/prompt.py +++ b/agent-app/app/prompt.py @@ -25,5 +25,6 @@ b. The user explicitly chose to start with the detailed architecture in point 10. This agent helps in specifying the exact technologies, configurations, and intricate details required for implementation. 12. Use the 'application_portfolio_analyzer' agent to help the user with any application or server details related queries and to create an application portfolio report. - 13. else use 'google_search_dummy_agent' + 13. If the user asks about database discovery or database profiling please delegate the task to the following agent `data_model_discovery_agent`. + 14. else use 'google_search_dummy_agent' """ diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py new file mode 100644 index 0000000..79406b4 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py @@ -0,0 +1 @@ +from .agent import data_model_discovery_agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py new file mode 100644 index 0000000..761baee --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py @@ -0,0 +1,218 @@ +import logging + +from google.adk.agents.llm_agent import LlmAgent +from google.adk.agents.readonly_context import ReadonlyContext + +from app.config import MODEL + +from .sub_agents.data_profiling_agent.agent import data_profiling_agent +from .sub_agents.database_cred_agent.agent import database_cred_agent +from .sub_agents.qa_agent.agent import qa_agent +from .sub_agents.reporting_agent.agent import reporting_agent +from .sub_agents.schema_introspection_agent.agent import schema_introspection_agent + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def root_agent_instruction(ctx: ReadonlyContext) -> str: + """Dynamically builds the Root Agent's instruction based on session state.""" + selected_schema = ctx.state.get("selected_schema") + db_connection = ctx.state.get("db_connection") + available_schemas = ctx.state.get("available_schemas") + schema_structure = ctx.state.get("schema_structure") + data_profile = ctx.state.get("data_profile") + + base_instruction = """ + ## Role + You are the **Root Agent** responsible for coordinating sub-agents to perform database discovery, introspection, profiling, and reporting tasks. + You manage the overall flow, handle user selections, and determine which sub-agent should be called. 
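For reference, a minimal sketch (not part of this diff) of how the state-driven `root_agent_instruction` builder can be exercised outside the ADK runtime. `_StubContext` is a hypothetical stand-in for `google.adk.agents.readonly_context.ReadonlyContext`; all the function reads is `ctx.state.get(...)`, so any object exposing a `state` mapping works. It assumes the `agent-app` package and its ADK dependencies are importable.

```python
# Illustrative only: exercises root_agent_instruction with fake session state.
# _StubContext is a hypothetical stand-in; the real ADK runtime supplies a
# ReadonlyContext. Assumes the agent-app package is on PYTHONPATH.
from dataclasses import dataclass, field
from typing import Any

from app.sub_agents.data_model_discovery_agent.agent import root_agent_instruction


@dataclass
class _StubContext:
    state: dict[str, Any] = field(default_factory=dict)


# No connection yet -> the instruction steers toward database_cred_agent.
no_conn = _StubContext()
print("No active database connection." in root_agent_instruction(no_conn))  # True

# Connected, schemas listed, none selected -> next step is schema introspection.
awaiting_schema = _StubContext(
    state={
        "db_connection": {"status": "connected", "metadata": {"db_type": "postgresql"}},
        "available_schemas": ["public", "sales"],
    }
)
print("schema_introspection_agent" in root_agent_instruction(awaiting_schema))  # True
```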
+ + ## Your Capabilities + - Explore tables, columns, and relationships in a database schema + - Check data quality and highlight issues like missing or duplicate values + - Generate reports and visual diagrams of your database schema + - Answer questions about your data and schema structure + + ### Sub-Agent Roles, Scope, and Boundaries + + Here is a definition of the roles, responsibilities, scope, and boundaries for each sub-agent you control: + + 1. **`database_cred_agent`**: + * **Scope:** Initial Database Connection and Schema Listing. + * **Responsibilities:** + * Politely interact with the user to collect all necessary database connection parameters: Host, Port, Database Name, User, Password, and Database Type (PostgreSQL, MySQL, MSSQL). + * Ensure all required fields are provided before proceeding. + * Call the `validate_db_connection` tool to verify the credentials and establish a test connection. + * Upon successful validation, retrieve and display the list of available schemas within the connected database to the user, formatted as a raw Markdown list. + * Store connection metadata and available schemas in the session state. + * **Boundaries:** + * Does **not** select a schema for the user; it only presents the list. + * Does **not** perform any schema introspection beyond listing schema names. + * Does **not** handle any tasks related to data profiling, reporting, or Q&A. + * Does **not** persist credentials beyond the current session's needs. + * Your task ends after presenting the schema list and prompting the user to choose. + + 2. **`schema_introspection_agent`**: + * **Scope:** Deep Schema Analysis. + * **Responsibilities:** + * Takes a single `schema_name` as input (this will be the user's query to this agent). + * Calls the `get_schema_details` tool, passing the input schema name in the `args` dictionary (e.g., `get_schema_details(args={"schema_name": query})`). The tool uses the stored connection to: + * Discover all tables and views. + * Detail columns for each table: names, data types, lengths, precision, nullability, defaults. + * Identify all constraints: PRIMARY KEY, UNIQUE, FOREIGN KEY, CHECK, NOT NULL. + * Discover all indexes, including columns and uniqueness. + * Capture view definitions. + * Identify explicit and potential inferred relationships. + * Flag relationship anomalies. + * The tool stores the comprehensive `schema_structure` object in the session state. + * Provides a brief summary of findings back to the Root Agent as a tool result. + * **Boundaries:** + * Does **not** connect to the database itself; relies on session state connection info. + * Does **not** profile the actual data within the tables. + * Does **not** generate user-facing reports or diagrams. + * Does **not** answer any follow-up questions about the schema details; this is the `qa_agent`'s role. If asked, state your task is complete. + + 3. **`data_profiling_agent`**: + * **Scope:** Data Quality Analysis. + * **Responsibilities:** + * Uses the `selected_schema` and `schema_structure` from the session state. + * Calls the `profile_schema_data` tool to execute queries against the database (using sampling) to perform EPIC 4 tasks. + * The tool stores the `data_profile` results in the session state. + * Upon successful tool completion, this agent's *only* next action is to call the `qa_agent` to summarize the profiling results for the user in the same turn, using an `AgentTool` call: `qa_agent(query="Data profiling just completed. 
Please summarize the key findings from the new data profile.")`. + * **Boundaries:** + * Does **not** perform schema introspection. + * Does **not** generate formatted reports. + * Does **not** directly respond to the user; it delegates the response to the `qa_agent`. + + 4. **`reporting_agent`**: + * **Scope:** Output Generation. + * **Responsibilities:** + * Reads `selected_schema`, `schema_structure`, and `data_profile` from the session state. + * Based on the user's query to this agent: + * Generates a high-level summary report using `generate_summary_report(args={})`. + * Exports the full discovery report as JSON `export_full_report(args={"format": "..."})`. + * Generates Mermaid ERD scripts using `generate_erd_script(args={})`. + * Returns the generated report or script content. + * **Boundaries:** + * Does **not** connect to the database or run any new analysis. + * Does **not** handle interactive Q&A. + + 5. **`qa_agent`**: + * **Scope:** Answering User Questions about Schema and Data Profile. + * **Responsibilities:** + * Reads `selected_schema`, `schema_structure`, and `data_profile` from the session state. + * Answers natural language questions from the user about any data contained within the state objects. + * Can provide a summary of Data Profiling results when prompted. + * Formats answers clearly, using Markdown tables where appropriate, as per its internal instructions. + * **Boundaries:** + * Does **not** connect to the database. + * Does **not** perform any new introspection or profiling. + * Does **not** generate file exports or full reports. + --- + """ + + if not db_connection or db_connection.get("status") != "connected": + return ( + base_instruction + + """ + **Current State:** No active database connection. + + **Your Task:** + 1. **Analyze the User's Query:** Determine the user's intent. + 2. **Database-Related Intent:** If the user's query suggests they want to perform any database operations (e.g., mentioning "database", "connect", "schema", "table", "analyze", "SQL", "postgres", "mysql", "mssql", "ERD", "report on DB", etc.), you MUST immediately call the `database_cred_agent` to initiate the connection process. Do not attempt to answer further. + - Example User Intents: "Analyze my database", "Connect to a database", "I want to see my tables". + - **Action:** Call `database_cred_agent()` + + 3. **General Conversation / Capability Inquiry:** If the user's query is a greeting ("Hi"), asks about your capabilities ("What can you do?"), or is general chat not related to database actions: + - Respond politely. + - Briefly explain your purpose: "I am a Data Discovery Agent designed to help you connect to, understand, profile, and report on your legacy databases (PostgreSQL, MySQL, MSSQL)." + - List your high-level capabilities: + * Securely connect to databases. + * Discover schemas, tables, columns, constraints, and relationships. + * Profile data quality (nulls, cardinality, orphans, etc.). + * Generate reports (Summaries, JSON, Mermaid script for ERD diagrams). + * Answer questions about the discovered schema and data profile. + - Crucially, state that to use these features, you'll need to connect to their database first. Example: "To get started with any of these actions, I'll need the connection details for your database. Let me know when you're ready to connect!" + - Do NOT call any sub-agents in this case. Await the user's next response. + + **Example Flow (No DB Intent):** + User: "Hello, what can you do?" + You: "Hi! I am a Data Discovery Agent... 
I can help you connect to databases + - Explore tables, columns, and relationships in a database schema + - Check data quality and highlight issues like missing or duplicate values + - Generate reports and visual diagrams of your database schema + - Answer questions about your data and schema structure + To do any of this, I'll first need to connect to your database. Just let me know when you want to proceed!" + """ + ) + elif available_schemas and not selected_schema: + return ( + base_instruction + + """ + **Current Task:** The user has been presented with a list of available schemas by the `database_cred_agent`. Their current input is expected to be the name of the schema they wish to analyze. + + 1. Consider the user's entire input as the desired schema name. + 2. You MUST call the `schema_introspection_agent`. Pass the user's input as the primary query to this sub-agent. The `schema_introspection_agent` is designed to take this input as the schema name for its operations. + - Example AgentTool Call: `schema_introspection_agent(user_input)` + 3. The `schema_introspection_agent` will handle storing the selected schema and fetching the details. Await its response. + """ + ) + elif selected_schema and schema_structure: + profile_status = "Completed" if data_profile else "Not Yet Run" + return ( + base_instruction + + f""" + **Current Context:** The database is connected. The schema '{selected_schema}' has been successfully introspected. + Data Quality Profile Status: {profile_status} + + **Task Delegation:** Based on the user's request, delegate to the appropriate sub-agent: + + - **"Profile Data"**, **"Data Quality"**, **"Run profiling"**: + Call `data_profiling_agent`. + - Example: `data_profiling_agent()` + + - **"Generate Report"**, **"Export"**, **"Diagram"**, **"Summary"**, **"ERD"**, **"JSON"**, **"YAML"**, **"Mermaid"**: + Call `reporting_agent` and pass the user's query. + - Example: `reporting_agent(user_input)` + + - **ANY other questions** about the tables, columns, constraints, relationships, views, indexes, anomalies within the '{selected_schema}' schema, or about the data profile results: + Call `qa_agent` and pass the user's question as the query. + - Example: `qa_agent(user_question)` + + If the user's intent is unclear, ask for clarification. You can remind them of the available actions. + """ + ) + elif selected_schema and not schema_structure: + return ( + base_instruction + + f""" + **Current Context:** The schema '{selected_schema}' was selected, but the introspection data is missing or incomplete. + - Recall `schema_introspection_agent` and pass the schema name '{selected_schema}' as the input to it to ensure the structure is loaded. + - Example AgentTool Call: `schema_introspection_agent("{selected_schema}")` + """ + ) + else: + return ( + base_instruction + + """ + **Current Task:** Determine the next step based on the conversation history and session state. If unsure, ask the user for clarification. + """ + ) + + +data_model_discovery_agent = LlmAgent( + model=MODEL, + name="data_model_discovery_agent", + description=( + "A helpful root agent that orchestrates sub-agents to introspect and profile legacy databases." 
+ ), + instruction=root_agent_instruction, + sub_agents=[ + database_cred_agent, + schema_introspection_agent, + qa_agent, + data_profiling_agent, + reporting_agent, + ], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py new file mode 100644 index 0000000..cbb7911 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py @@ -0,0 +1,54 @@ +from google.adk.agents.llm_agent import LlmAgent + +from app.config import MODEL + +from .tools import profile_schema_data + +data_profiling_agent = LlmAgent( + model=MODEL, + name="data_profiling_agent", + description="Profiles data quality for the selected schema and then calls QA agent to summarize.", + instruction=""" + ### Role + You are a **Data Profiling Agent**. Your sole responsibility is to run data profiling on a schema and then immediately hand off the summary of findings to the QA agent for user-facing reporting. + + ### Scope + - You ONLY execute profiling tasks and hand off the summary to the QA agent. + - Do NOT attempt to answer user questions directly. + - Profiling includes only schema-level data statistics (column nullability, cardinality, orphan records, data type anomalies). + + ### Profiling Tasks + 1. **Column Nullability:** For each column, calculate and report the percentage of NULL values based on a representative sample (e.g., top 10,000 rows). + 2. **Column Cardinality:** For key columns (PKs, FKs, inferred keys), report the cardinality (count of unique values). + 3. **Orphan Record Detection:** Sample FK columns and report the percentage of orphan records (e.g., orders.customer_id values missing in customers.id). + 4. **Data Type Anomalies:** For text-based columns (VARCHAR, CHAR), detect potential type inconsistencies (e.g., customer_phone containing non-numeric characters). + + ### Task Execution + 1. **Receive Input:** The user's query or relevant arguments (e.g., `sample_size`) are available in `query`. + + 2. **Call Profiling Tool:** Invoke `profile_schema_data` with the arguments: + ```python + profile_schema_data(args=query if isinstance(query, dict) else {}) + ``` + 3. **Process Profiling Results:** + - If `status` is `"success"`: + - Store profiling results in the session state. + - **Do NOT return results directly to the user.** + - Immediately invoke the QA agent to summarize the findings: + ```python + qa_agent(query="Data profiling just completed. Please summarize the key findings from the new data profile.") + ``` + - If the tool call fails, return a human-readable error dictionary: + ```json + {"error": "Failed to profile data: "} + ``` + + ### Important + - Your execution ends after handing off to the QA agent. + - Do not provide analysis, interpretation, or answers outside the profiling scope. + - Forward all user-facing summaries and questions to the QA agent. 
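For orientation, a hedged sketch of the `data_profile` structure that `profile_schema_data` writes into session state and that the QA and reporting agents later read. The table and column names below are made up; the four top-level keys, the rounded percentage / `"Error"` values, and the `"from.col -> to.col"` orphan-record keys mirror the profiling utils introduced later in this diff.

```python
# Illustrative only: a plausible shape for the "data_profile" session-state
# entry. Table/column names are hypothetical; the key layout follows the
# profiling utils added in this diff.
example_data_profile = {
    "nullability": {
        # table -> column -> % NULLs in the sampled rows (or "Error")
        "customers": {"id": 0.0, "phone": 12.4},
        "orders": {"id": 0.0, "customer_id": 0.0, "notes": 63.1},
    },
    "cardinality": {
        # table -> key columns (PK/UNIQUE/FK) -> distinct-value count
        "customers": {"id": 9874},
        "orders": {"id": 10000, "customer_id": 3121},
    },
    "orphan_records": {
        # "from_table.from_col -> to_table.to_col" -> % orphan FK values
        "orders.customer_id -> customers.id": 1.37,
    },
    "type_anomalies": {
        # "table.column" -> list of findings from the sampled rows
        "customers.phone": ["Found 42 rows with non-numeric characters in sample."],
    },
}

# The summary report tool later in this diff counts columns with >50% NULLs
# and FKs with >10% orphans from exactly this structure.
high_null_columns = sum(
    1
    for cols in example_data_profile["nullability"].values()
    for pct in cols.values()
    if isinstance(pct, (int, float)) and pct > 50
)
print(high_null_columns)  # -> 1 ("orders.notes" in this made-up sample)
```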
+ """, + tools=[ + profile_schema_data, + ], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py new file mode 100644 index 0000000..d632bb4 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py @@ -0,0 +1,113 @@ +import logging +from typing import Any + +import mysql.connector +import psycopg2 +import pyodbc +from google.adk.tools import ToolContext + +from .utils import ( + mssql_profiling_utils, + mysql_profiling_utils, + postgres_profiling_utils, +) + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def _get_db_connection(metadata: dict[str, Any], password: str) -> Any: + db_type = metadata.get("db_type") + host = metadata.get("host") + port_value = metadata.get("port") + port = int(port_value) if port_value is not None else None + dbname = metadata.get("dbname") + user = metadata.get("user") + logger.info( + f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}" + ) + if db_type == "postgresql": + return psycopg2.connect( + host=host, port=port, dbname=dbname, user=user, password=password + ) + elif db_type == "mysql": + return mysql.connector.connect( + host=host, port=port, database=dbname, user=user, password=password + ) + elif db_type == "mssql": + conn_str = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={host},{port};DATABASE={dbname};UID={user};PWD={password}" + return pyodbc.connect(conn_str) + else: + raise ValueError(f"Unsupported database type: {db_type}") + + +async def profile_schema_data( + tool_context: ToolContext, args: dict[str, Any] +) -> dict[str, Any]: + """ + Profiles the data in the selected schema based on the schema structure. + Calculates nullability, cardinality, orphan records, and type anomalies. + Sets a flag on successful completion. + """ + + db_conn_state = tool_context.state.get("db_connection") + db_creds = tool_context.state.get("db_creds_temp") + schema_name = tool_context.state.get("selected_schema") + schema_structure = tool_context.state.get("schema_structure") + sample_size = args.get("sample_size", 10000) + + if not db_conn_state or db_conn_state.get("status") != "connected": + return {"error": "DB not connected."} + if not db_creds: + return {"error": "DB credentials not found."} + if not schema_name: + return {"error": "Selected schema not found."} + if not schema_structure: + return {"error": "Schema structure not found. Please run introspection first."} + + metadata = db_conn_state["metadata"] + password = db_creds["password"] + db_type = metadata["db_type"] + + conn = None + try: + conn = _get_db_connection(metadata, password) + logger.info( + f"Reconnected to {db_type} for data profiling of schema '{schema_name}'." 
+ ) + + if db_type == "postgresql": + profile_results = postgres_profiling_utils.profile_postgres_data( + conn, schema_name, schema_structure, sample_size + ) + elif db_type == "mysql": + profile_results = mysql_profiling_utils.profile_mysql_data( + conn, schema_name, schema_structure, sample_size + ) + elif db_type == "mssql": + profile_results = mssql_profiling_utils.profile_mssql_data( + conn, schema_name, schema_structure, sample_size + ) + else: + return {"error": f"Profiling for {db_type} not implemented."} + + tool_context.state["data_profile"] = profile_results + tool_context.state["profiling_just_completed"] = True # Set the flag + logger.info( + f"Data profiling results for '{schema_name}' saved to session state." + ) + + return { + "status": "success", + "message": f"Data profiling completed for schema '{schema_name}'. Results are stored.", + "schema_name": schema_name, + } + except Exception as e: + logger.error(f"Error during data profiling: {e}", exc_info=True) + return {"error": f"Failed to profile data for {db_type} ({schema_name}): {e!s}"} + finally: + if conn: + try: + conn.close() + except Exception as e: + logger.error(f"Error closing {db_type} connection: {e}") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py new file mode 100644 index 0000000..8f74741 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py @@ -0,0 +1,142 @@ +import logging +from typing import Any + +logger = logging.getLogger(__name__) + + +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts for SQL Server.""" + cursor = conn.cursor() + try: + cursor.execute(query) + if cursor.description: + columns = [column[0] for column in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, row, strict=False)) for row in rows] + return [] + finally: + cursor.close() + + +def profile_mssql_data( + conn: Any, + schema_name: str, + schema_structure: dict[str, Any], + sample_size: int = 10000, +) -> dict[str, Any]: + profile_results: dict[str, Any] = { + "nullability": {}, + "cardinality": {}, + "orphan_records": {}, + "type_anomalies": {}, + } + tables = schema_structure.get("tables", {}) + + for table_name, table_info in tables.items(): + logger.info(f"Profiling table: {schema_name}.{table_name}") + profile_results["nullability"][table_name] = {} + profile_results["cardinality"][table_name] = {} + full_table_name = f"[{schema_name}].[{table_name}]" + + for col_name in table_info.get("columns", {}): + null_q = f""" + SELECT + COUNT_BIG(*) as total_count, + COUNT_BIG(*) - COUNT([{col_name}]) as null_count + FROM (SELECT TOP {sample_size} [{col_name}] FROM {full_table_name}) as sampled; + """ + try: + res = _execute_query(conn, null_q)[0] + total_count = int(res["total_count"]) + null_count = int(res["null_count"]) + null_pct = (null_count / total_count) * 100 if total_count > 0 else 0 + profile_results["nullability"][table_name][col_name] = round( + null_pct, 2 + ) + except Exception as e: 
+ logger.error( + f"Error profiling nulls for {full_table_name}.[{col_name}]: {e}" + ) + profile_results["nullability"][table_name][col_name] = "Error" + + key_columns = set() + for const in table_info.get("constraints", []): + if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): + key_columns.add(const["columns"]) + for fk in schema_structure.get("foreign_keys", []): + if fk.get("from_table") == table_name and fk.get("from_column"): + key_columns.add(fk["from_column"]) + + for col_name in key_columns: + if col_name in table_info.get("columns", {}): + card_q = f"SELECT COUNT(DISTINCT [{col_name}]) as unique_count FROM {full_table_name};" + try: + res = _execute_query(conn, card_q)[0] + profile_results["cardinality"][table_name][col_name] = int( + res["unique_count"] + ) + except Exception as e: + logger.error( + f"Error profiling cardinality for {full_table_name}.[{col_name}]: {e}" + ) + profile_results["cardinality"][table_name][col_name] = "Error" + + for fk in schema_structure.get("foreign_keys", []): + from_table, from_col = fk.get("from_table"), fk.get("from_column") + to_table, to_col = fk.get("to_table"), fk.get("to_column") + to_schema = fk.get("to_schema", schema_name) + if from_table and from_col and to_table and to_col: + fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" + logger.info(f"Checking orphans for {fk_name}") + from_full = f"[{schema_name}].[{from_table}]" + to_full = f"[{to_schema}].[{to_table}]" + orphan_q = f""" + SELECT + COUNT_BIG(s.[{from_col}]) as total_fk_values, + SUM(CASE WHEN t.[{to_col}] IS NULL THEN 1 ELSE 0 END) as orphan_count + FROM (SELECT TOP {sample_size} [{from_col}] FROM {from_full} WHERE [{from_col}] IS NOT NULL) as s + LEFT JOIN {to_full} t ON s.[{from_col}] = t.[{to_col}]; + """ + try: + res = _execute_query(conn, orphan_q)[0] + total_fk_values = int(res["total_fk_values"]) + orphan_count = int(res["orphan_count"]) + orphan_pct = ( + (orphan_count / total_fk_values) * 100 if total_fk_values > 0 else 0 + ) + profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) + except Exception as e: + logger.error(f"Error checking orphans for {fk_name}: {e}") + profile_results["orphan_records"][fk_name] = "Error" + + for table_name, table_info in tables.items(): + full_table_name = f"[{schema_name}].[{table_name}]" + for col_name, col_info in table_info.get("columns", {}).items(): + col_type = col_info.get("type", "").lower() + if "char" in col_type or "text" in col_type or "varchar" in col_type: + if ( + "phone" in col_name.lower() + or "zip" in col_name.lower() + or "postal" in col_name.lower() + ): + # Regex for anything not a digit, hyphen, or period + anomaly_q = f""" + SELECT COUNT_BIG(*) as non_numeric_count + FROM (SELECT TOP {sample_size} [{col_name}] FROM {full_table_name} WHERE [{col_name}] IS NOT NULL) as s + WHERE [{col_name}] LIKE '%[^0-9.-]%'; + """ + try: + res = _execute_query(conn, anomaly_q)[0] + non_numeric_count = int(res["non_numeric_count"]) + if non_numeric_count > 0: + key = f"{table_name}.{col_name}" + if key not in profile_results["type_anomalies"]: + profile_results["type_anomalies"][key] = [] + profile_results["type_anomalies"][key].append( + f"Found {non_numeric_count} rows with non-numeric characters in sample." 
+ ) + except Exception as e: + logger.warning( + f"Error checking type anomaly for {full_table_name}.[{col_name}]: {e}" + ) + return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py new file mode 100644 index 0000000..8b73fa1 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py @@ -0,0 +1,139 @@ +import logging +from typing import Any + +logger = logging.getLogger(__name__) + + +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: + cursor = conn.cursor(dictionary=True) + try: + cursor.execute(query) + return cursor.fetchall() + finally: + cursor.close() + + +def profile_mysql_data( + conn: Any, + schema_name: str, + schema_structure: dict[str, Any], + sample_size: int = 10000, +) -> dict[str, Any]: + try: + conn.database = schema_name + except Exception as e: + logger.error(f"Failed to set database {schema_name}: {e}") + raise + + profile_results: dict[str, Any] = { + "nullability": {}, + "cardinality": {}, + "orphan_records": {}, + "type_anomalies": {}, + } + tables = schema_structure.get("tables", {}) + + for table_name, table_info in tables.items(): + logger.info(f"Profiling table: {schema_name}.{table_name}") + profile_results["nullability"][table_name] = {} + profile_results["cardinality"][table_name] = {} + # Nullability + for col_name in table_info.get("columns", {}): + null_q = f""" + SELECT + COUNT(*) as total_count, + SUM(CASE WHEN `{col_name}` IS NULL THEN 1 ELSE 0 END) as null_count + FROM (SELECT `{col_name}` FROM `{table_name}` LIMIT {sample_size}) as sampled; + """ + try: + res = _execute_query(conn, null_q)[0] + null_pct = ( + (res["null_count"] / res["total_count"]) * 100 + if res["total_count"] > 0 + else 0 + ) + profile_results["nullability"][table_name][col_name] = round( + null_pct, 2 + ) + except Exception as e: + logger.error(f"Error profiling nulls for {table_name}.{col_name}: {e}") + profile_results["nullability"][table_name][col_name] = "Error" + + # Cardinality - PKs, FKs + key_columns = set() + for const in table_info.get("constraints", []): + if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): + key_columns.add(const["columns"]) + for fk in schema_structure.get("foreign_keys", []): + if fk.get("from_table") == table_name and fk.get("from_column"): + key_columns.add(fk["from_column"]) + + for col_name in key_columns: + if col_name in table_info.get("columns", {}): + card_q = f"SELECT COUNT(DISTINCT `{col_name}`) as unique_count FROM `{table_name}`;" + try: + res = _execute_query(conn, card_q)[0] + profile_results["cardinality"][table_name][col_name] = res[ + "unique_count" + ] + except Exception as e: + logger.error( + f"Error profiling cardinality for {table_name}.{col_name}: {e}" + ) + profile_results["cardinality"][table_name][col_name] = "Error" + + # Orphan Records + for fk in schema_structure.get("foreign_keys", []): + from_table, from_col = fk.get("from_table"), fk.get("from_column") + to_table, to_col = fk.get("to_table"), fk.get("to_column") + if from_table and from_col and to_table and to_col: + fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" + logger.info(f"Checking orphans for {fk_name}") + orphan_q = f""" + SELECT + COUNT(s.`{from_col}`) as total_fk_values, + SUM(CASE WHEN t.`{to_col}` IS NULL THEN 1 ELSE 0 
END) as orphan_count + FROM (SELECT `{from_col}` FROM `{from_table}` WHERE `{from_col}` IS NOT NULL LIMIT {sample_size}) as s + LEFT JOIN `{to_table}` t ON s.`{from_col}` = t.`{to_col}`; + """ + try: + res = _execute_query(conn, orphan_q)[0] + orphan_pct = ( + (res["orphan_count"] / res["total_fk_values"]) * 100 + if res["total_fk_values"] > 0 + else 0 + ) + profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) + except Exception as e: + logger.error(f"Error checking orphans for {fk_name}: {e}") + profile_results["orphan_records"][fk_name] = "Error" + + # Type Anomalies - Heuristic for phone/zip + for table_name, table_info in tables.items(): + for col_name, col_info in table_info.get("columns", {}).items(): + col_type = col_info.get("type", "").lower() + if "char" in col_type or "text" in col_type: + if ( + "phone" in col_name.lower() + or "zip" in col_name.lower() + or "postal" in col_name.lower() + ): + anomaly_q = f""" + SELECT COUNT(*) as non_numeric_count + FROM (SELECT `{col_name}` FROM `{table_name}` WHERE `{col_name}` IS NOT NULL LIMIT {sample_size}) as s + WHERE `{col_name}` REGEXP '[^0-9.-]'; + """ + try: + res = _execute_query(conn, anomaly_q)[0] + if res["non_numeric_count"] > 0: + key = f"{table_name}.{col_name}" + if key not in profile_results["type_anomalies"]: + profile_results["type_anomalies"][key] = [] + profile_results["type_anomalies"][key].append( + f"Found {res['non_numeric_count']} rows with non-numeric characters in sample." + ) + except Exception as e: + logger.warning( + f"Error checking type anomaly for {table_name}.{col_name}: {e}" + ) + return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py new file mode 100644 index 0000000..9217f46 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py @@ -0,0 +1,146 @@ +import logging +from typing import Any + +logger = logging.getLogger(__name__) + + +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" + cursor = conn.cursor() + try: + conn.autocommit = True + cursor.execute(query) + if cursor.description: + columns = [desc[0] for desc in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, row, strict=False)) for row in rows] + return [] + finally: + cursor.close() + + +def profile_postgres_data( + conn: Any, + schema_name: str, + schema_structure: dict[str, Any], + sample_size: int = 10000, +) -> dict[str, Any]: + profile_results: dict[str, dict] = { + "nullability": {}, + "cardinality": {}, + "orphan_records": {}, + "type_anomalies": {}, + } + tables = schema_structure.get("tables", {}) + + for table_name, table_info in tables.items(): + logger.info(f"Profiling table: {schema_name}.{table_name}") + profile_results["nullability"][table_name] = {} + profile_results["cardinality"][table_name] = {} + full_table_name = f'"{schema_name}"."{table_name}"' + + for col_name in table_info.get("columns", {}): + null_q = f""" + SELECT + COUNT(*) as total_count, + COUNT(*) - COUNT("{col_name}") as null_count + FROM (SELECT "{col_name}" FROM {full_table_name} LIMIT {sample_size}) as sampled; + """ + try: + res = _execute_query(conn, null_q)[0] + total_count = int(res["total_count"]) + null_count = 
int(res["null_count"]) + null_pct = (null_count / total_count) * 100 if total_count > 0 else 0 + profile_results["nullability"][table_name][col_name] = round( + null_pct, 2 + ) + except Exception as e: + logger.error( + f'Error profiling nulls for {full_table_name}."{col_name}": {e}' + ) + profile_results["nullability"][table_name][col_name] = "Error" + + key_columns = set() + for const in table_info.get("constraints", []): + if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): + key_columns.add(const["columns"]) + for fk in schema_structure.get("foreign_keys", []): + if fk.get("from_table") == table_name and fk.get("from_column"): + key_columns.add(fk["from_column"]) + + for col_name in key_columns: + if col_name in table_info.get("columns", {}): + card_q = f'SELECT COUNT(DISTINCT "{col_name}") as unique_count FROM {full_table_name};' + try: + res = _execute_query(conn, card_q)[0] + profile_results["cardinality"][table_name][col_name] = int( + res["unique_count"] + ) + except Exception as e: + logger.error( + f'Error profiling cardinality for {full_table_name}."{col_name}": {e}' + ) + profile_results["cardinality"][table_name][col_name] = "Error" + + for fk in schema_structure.get("foreign_keys", []): + from_table, from_col = fk.get("from_table"), fk.get("from_column") + to_table, to_col = fk.get("to_table"), fk.get("to_column") + to_schema = fk.get( + "to_schema", schema_name + ) # Assume same schema if not specified + if from_table and from_col and to_table and to_col: + fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" + logger.info(f"Checking orphans for {fk_name}") + from_full = f'"{schema_name}"."{from_table}"' + to_full = f'"{to_schema}"."{to_table}"' + orphan_q = f""" + SELECT + COUNT(s."{from_col}") as total_fk_values, + SUM(CASE WHEN t."{to_col}" IS NULL THEN 1 ELSE 0 END) as orphan_count + FROM (SELECT "{from_col}" FROM {from_full} WHERE "{from_col}" IS NOT NULL LIMIT {sample_size}) as s + LEFT JOIN {to_full} t ON s."{from_col}" = t."{to_col}"; + """ + try: + res = _execute_query(conn, orphan_q)[0] + total_fk_values = int(res["total_fk_values"]) + orphan_count = int(res["orphan_count"]) + orphan_pct = ( + (orphan_count / total_fk_values) * 100 if total_fk_values > 0 else 0 + ) + profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) + except Exception as e: + logger.error(f"Error checking orphans for {fk_name}: {e}") + profile_results["orphan_records"][fk_name] = "Error" + + for table_name, table_info in tables.items(): + full_table_name = f'"{schema_name}"."{table_name}"' + for col_name, col_info in table_info.get("columns", {}).items(): + col_type = col_info.get("type", "").lower() + if "char" in col_type or "text" in col_type: + if ( + "phone" in col_name.lower() + or "zip" in col_name.lower() + or "postal" in col_name.lower() + ): + # Regex for anything not a digit, hyphen, or period + anomaly_q = f""" + SELECT COUNT(*) as non_numeric_count + FROM (SELECT "{col_name}" FROM {full_table_name} WHERE "{col_name}" IS NOT NULL LIMIT {sample_size}) as s + WHERE "{col_name}" ~ '[^0-9.-]'; + """ + try: + res = _execute_query(conn, anomaly_q)[0] + non_numeric_count = int(res["non_numeric_count"]) + if non_numeric_count > 0: + key = f"{table_name}.{col_name}" + if key not in profile_results["type_anomalies"]: + profile_results["type_anomalies"][key] = [] + profile_results["type_anomalies"][key].append( + f"Found {non_numeric_count} rows with non-numeric characters in sample." 
+ ) + except Exception as e: + logger.warning( + f'Error checking type anomaly for {full_table_name}."{col_name}": {e}' + ) + + return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/__init__.py @@ -0,0 +1 @@ +from . import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py new file mode 100644 index 0000000..17341b3 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py @@ -0,0 +1,75 @@ +from google.adk.agents.llm_agent import LlmAgent + +from app.config import MODEL + +from .tools import validate_db_connection + +database_cred_agent = LlmAgent( + model=MODEL, + name="database_cred_agent", + description="A helpful assistant that collects and validates database connection details, and lists available schemas.", + instruction=""" + ### Role + You are a helpful assistant responsible for gathering, validating, and confirming database connection details from the user, then listing the available schemas for selection. Your responses containing lists of schemas MUST be in raw Markdown format. + + --- + + ### Instructions + + 1. **Collect Connection Details** + - You will be called by the Root Agent when database connection details are needed. + - Politely request the following information from the user: + ``` + To proceed with database operations, I need your connection details. + Please provide: + * **Host:** (e.g., localhost, server.example.com) + * **Port:** (e.g., 5432 for PostgreSQL, 3306 for MySQL, 1433 for MSSQL) + * **Database Name:** (The specific database to connect to) + * **User:** (Database username) + * **Password:** (Database password) + * **Database Type:** One of "postgresql", "mysql", or "mssql" + ``` + - Do **not** proceed to validation until all fields are provided. + - If any field is missing, politely ask only for the missing detail(s). + - When creating the connection details map for the tool call, ensure that the user-provided information is mapped to these exact keys: + - `"host"`, `"port"`, `"dbname"`, `"user"`, `"password"`, `"db_type"` + + 2. **Validate the Connection** + - Once all details are collected, call the `validate_db_connection` tool. + - Pass the gathered information as a single dictionary argument named `connection_details`. + + 3. **Handle Validation Response** + - **On Success:** + 1. Acknowledge that the database connection was successful. + 2. Retrieve the list of available schemas from the tool’s output (`schemas` key). + 3. **You MUST generate a response containing a raw Markdown bulleted list** to display the schemas. Construct the list string as shown below. + + - **Raw Markdown Output Example:** + The text you output should be exactly like this, including newlines: + ``` + Connection successful! Here are the available schemas: + + - schema1 + - schema2 + - schema3 + + Please type the name of the schema you would like to analyze. + ``` + Replace `schema1`, `schema2`, etc., with the actual schema names from the tool result, ensuring each schema starts with '- ' on a new line. 
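As a side note, a small hedged sketch of how the `schemas` list returned by a successful `validate_db_connection` call could be rendered into the raw Markdown reply this instruction asks for; the helper name and sample schema names are hypothetical.

```python
# Illustrative only: builds the raw Markdown reply the database_cred_agent is
# instructed to produce from the "schemas" key of a successful
# validate_db_connection result. Helper and schema names are hypothetical.
def format_schema_list(schemas: list[str]) -> str:
    bullets = "\n".join(f"- {name}" for name in schemas)
    return (
        "Connection successful! Here are the available schemas:\n\n"
        f"{bullets}\n\n"
        "Please type the name of the schema you would like to analyze."
    )


print(format_schema_list(["public", "sales", "reporting"]))
```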
+ + - **On Error:** + - Inform the user that there was an issue connecting to the database in a user-friendly way. + - Politely ask if they would like to try again. + - **Never** display or expose the raw database error message or any sensitive details . Example: "I was unable to connect to the database. Please check the details and let me know if you'd like to try again." + --- + + ### Notes + - Maintain a polite and professional tone throughout. + - Your output for the schema list must be the raw text representing the Markdown table, not a visual rendering. + - Do **not** connect directly to the database or modify session state yourself. Your role is limited to collecting inputs, calling `validate_db_connection`, and formatting the results as instructed. + - Never reveal or echo back the user’s password. + - Do not assume or confirm which schema the user will select. Your task ends after presenting the list of schemas and asking the user to choose. + - If the user asks for database connection details, you may display the host, port, and database name, but you must **never** reveal the password or any sensitive credentials. + """, + tools=[validate_db_connection], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py new file mode 100644 index 0000000..810c5e7 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py @@ -0,0 +1,160 @@ +import logging +from typing import Any + +import mysql.connector +import psycopg2 +import pyodbc +from google.adk.tools import ToolContext + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def _get_schemas(conn: Any, db_type: str) -> list[str]: + """Fetches list of schemas/databases based on db type.""" + schemas = [] + cursor = conn.cursor() + try: + if db_type == "postgresql": + cursor.execute( + "SELECT schema_name FROM information_schema.schemata " + "WHERE schema_name NOT LIKE 'pg_%' AND schema_name != 'information_schema';" + ) + schemas = [row[0] for row in cursor.fetchall()] + elif db_type == "mysql": + cursor.execute("SHOW DATABASES;") + # Filtering out default mysql databases + default_dbs = {"information_schema", "mysql", "performance_schema", "sys"} + schemas = [row[0] for row in cursor.fetchall() if row[0] not in default_dbs] + elif db_type == "mssql": + cursor.execute("SELECT name FROM sys.schemas;") + # Filter out default mssql schemas + default_schemas = { + "db_accessadmin", + "db_backupoperator", + "db_datareader", + "db_datawriter", + "db_ddladmin", + "db_denydatareader", + "db_denydatawriter", + "db_owner", + "db_securityadmin", + "guest", + "INFORMATION_SCHEMA", + "sys", + } + schemas = [ + row[0] for row in cursor.fetchall() if row[0] not in default_schemas + ] + finally: + cursor.close() + return schemas + + +async def validate_db_connection( + connection_details: dict[str, Any], tool_context: ToolContext +) -> dict[str, Any]: + """Validates a database connection for PostgreSQL, MySQL, or MSSQL, + fetches available schemas, and saves metadata to session memory. + + Args: + connection_details: Database credentials including host, port, dbname, user, password, + and db_type ("postgresql", "mysql", or "mssql"). + tool_context: The runtime context used to store session-level state. + + Returns: + A dict with: + - status: "success" if connection is valid, else "error". 
+ - message: Details about the validation result. + - schemas: List of schemas (only on success). + """ + safe_log = {k: v for k, v in connection_details.items() if k != "password"} + logger.info(f"Attempting connection with details: {safe_log}") + + required_keys = ["host", "port", "dbname", "user", "password", "db_type"] + missing_keys = [k for k in required_keys if k not in connection_details] + if missing_keys: + error_msg = f"Missing required parameters: {', '.join(missing_keys)}" + logger.error(error_msg) + return {"status": "error", "message": error_msg} + + db_type = connection_details["db_type"].lower() + conn = None + try: + if db_type == "postgresql": + conn = psycopg2.connect( + host=connection_details["host"], + port=connection_details["port"], + dbname=connection_details["dbname"], + user=connection_details["user"], + password=connection_details["password"], + ) + elif db_type == "mysql": + conn = mysql.connector.connect( + host=connection_details["host"], + port=connection_details["port"], + database=connection_details["dbname"], + user=connection_details["user"], + password=connection_details["password"], + ) + elif db_type == "mssql": + conn_str = ( + f"DRIVER={{ODBC Driver 17 for SQL Server}};" + f"SERVER={connection_details['host']},{connection_details['port']};" + f"DATABASE={connection_details['dbname']};" + f"UID={connection_details['user']};" + f"PWD={connection_details['password']}" + ) + conn = pyodbc.connect(conn_str) + else: + error_msg = f"Unsupported database type: {db_type}. Supported types are: postgresql, mysql, mssql." + logger.error(error_msg) + return {"status": "error", "message": error_msg} + + logger.info( + f"{db_type.upper()} connection established successfully for validation." + ) + + # Fetch schemas + schemas = _get_schemas(conn, db_type) + logger.info(f"Successfully fetched schemas: {schemas}") + + # Clear any previous connection state + if "db_connection" in tool_context.state: + tool_context.state["db_connection"] = None + if "db_creds_temp" in tool_context.state: + tool_context.state["db_creds_temp"] = None + if "selected_schema" in tool_context.state: + tool_context.state["selected_schema"] = None + + tool_context.state["db_connection"] = { + "metadata": { + "host": connection_details["host"], + "port": connection_details["port"], + "dbname": connection_details["dbname"], + "user": connection_details["user"], + "db_type": db_type, + }, + "status": "connected", + } + tool_context.state["db_creds_temp"] = { + "password": connection_details["password"] + } + + logger.info("Connection metadata saved in session memory.") + return { + "status": "success", + "message": f"{db_type.upper()} connection validated successfully.", + "schemas": schemas, + } + + except Exception as e: + logger.error(f"Database connection or schema fetch failed for {db_type}: {e}") + if "db_connection" in tool_context.state: + tool_context.state["db_connection"] = None + if "db_creds_temp" in tool_context.state: + tool_context.state["db_creds_temp"] = None + return { + "status": "error", + "message": f"Connection/Schema fetch failed for {db_type}: {e}", + } diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/__init__.py @@ -0,0 +1 @@ +from . 
import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py new file mode 100644 index 0000000..7878ca6 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py @@ -0,0 +1,110 @@ +import json +from decimal import Decimal + +from google.adk.agents.llm_agent import LlmAgent +from google.adk.agents.readonly_context import ReadonlyContext + +from app.config import MODEL + + +def json_encoder_default(obj): + if isinstance(obj, Decimal): + return str(obj) + raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable") + + +def qa_agent_instruction(ctx: ReadonlyContext) -> str: + """Builds the QA agent's instruction for schema and data profiling queries.""" + + schema_structure = ctx.state.get("schema_structure") + data_profile = ctx.state.get("data_profile") + selected_schema = ctx.state.get("selected_schema", "the selected schema") + + # Handle missing schema + if not schema_structure: + return f""" + ### Role + You are a Database Schema & Data Profile Q&A Assistant. + + ### Task + I currently do not have the schema details for '{selected_schema}'. + To answer schema-related questions, the schema must be introspected first. + You might say: "I don't have the schema details yet. Would you like me to run schema discovery first?" + """ + + try: + schema_json = json.dumps( + schema_structure, indent=2, default=json_encoder_default + ) + except Exception as e: + schema_json = f"Error serializing schema structure: {e}" + + # Handle data profiling + profile_message = "" + if data_profile: + try: + # Only display human-readable summary, not raw session variables + profile_summary = { + "Nullability": data_profile.get("nullability", "Not available"), + "Cardinality": data_profile.get("cardinality", "Not available"), + "Orphan Records": data_profile.get("orphan_records", "Not available"), + "Type Anomalies": data_profile.get("type_anomalies", "Not available"), + } + profile_message = json.dumps( + profile_summary, indent=2, default=json_encoder_default + ) + except Exception: + profile_message = ( + "Data profiling results exist but could not be summarized." + ) + else: + profile_message = ( + "Data profiling has not been run yet. " + "If you would like, I can run data profiling on this database " + "(sampling up to 10,000 rows) and provide a summary of key findings." + ) + + return f""" + ### Role + You are a Database Schema & Data Profile Q&A Assistant. Your goal is to answer user questions + about the database schema and data profiling in a conversational, human-friendly way. + + ### Schema Context for '{selected_schema}' + The schema has been discovered and includes tables, columns, constraints, and relationships. + ```json + {schema_json} + ``` + + ### Data Profiling Context for '{selected_schema}' + {profile_message} + + ### Instructions + 1. Answer questions only based on the provided schema structure and data profiling information. + 2. Avoid exposing raw internal session variables or empty lists directly. Answer conversationally. + 3. If data profiling has not been run and the user asks about it, politely suggest running profiling on up to 10,000 rows. + 4. If the user asks to generate a **Mermaid diagram** of the schema or to **export the schema structure as a JSON response**, transfer the request to the `reporting_agent` by calling: + `transfer_to_agent(reporting_agent, query)` + 5. 
Use tables for lists when helpful. + 6. If a question is outside your scope, guide the user to the appropriate agent instead. + + ### Examples + * "List all tables": List tables from the schema. + * "Columns in 'customers'?": List columns for that table. + * "FKs for 'orders'?": List foreign keys involving that table. + * "Which columns have high nulls?": Refer to data profiling nullability. + * "Are there orphan records?": Summarize orphan records in a human-friendly way. + * "Any type anomalies?": List columns with type inconsistencies in plain language. + * "Generate a Mermaid diagram of the schema": Transfer to `reporting_agent`. + * "Export the schema as JSON": Transfer to `reporting_agent`. + + Always respond in clear, human-readable sentences. If profiling data is missing, offer to run profiling on a sample of up to 10,000 rows to provide a summary. + """ + + +qa_agent = LlmAgent( + model=MODEL, + name="qa_agent", + description="Answers natural language questions about the discovered database schema structure and data profiling results.", + instruction=qa_agent_instruction, + tools=[], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py new file mode 100644 index 0000000..51d5cc7 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py @@ -0,0 +1,64 @@ +from google.adk.agents.llm_agent import LlmAgent + +from app.config import MODEL + +from .tools import export_full_report, generate_erd_script, generate_summary_report + +reporting_agent = LlmAgent( + model=MODEL, + name="reporting_agent", + description="Generates reports, exports data, and creates schema diagrams.", + instruction=""" + ### Role + You are a Reporting Agent. You generate human-readable summaries, export detailed data, and create scripts for schema visualizations based on the analysis performed by other agents. + + ### Context + - You rely on data stored in the session state: + - `selected_schema`: The name of the analyzed schema. + - `schema_structure`: Detailed schema information from introspection. + - `data_profile`: Data quality profiling results. + + ### Tasks + Based on the user's request, call the appropriate tool: + + 1. **Summary Report:** + - If the user asks for a "summary", "overview", or "high-level report". + - Call: `generate_summary_report()` + - Present the `report_text` from the tool result to the user. + + 2. **Export Full Report:** + - If the user asks to "export", "generate full report" or "report in JSON". + - The **default and only supported format** is **JSON**. + - If any other format is requested (CSV, XML, PDF, etc.), politely inform the user: + > I currently support exporting reports only in **JSON format**. + > Would you like me to generate the report in JSON instead? + - Call: `export_full_report(args={"format": "json"})`. + - Inform the user the report is generated and provide the content within a code block. Example: + "Here is the full report in JSON format: + ```json + {tool_result.report_content} + ``` + + 3. 
**Generate ERD Script:** + - When the user asks for an ERD diagram or schema visualization or mermaid script, generate a correct mermaid script without any additional comments + - Call: `generate_erd_script()` + - As a response provide the user with list of 2 responses block + - First block dedicatedly contains the mermaid script as shown below. No PREAMBLE + ```mermaid + {tool_result.script} + ``` + - Second Block contains a message that says you can paste this into a {tool_result.script_type} renderer to visualize the schema and asks the user if there is anything that you can help with. + + 4. **Error Handling:** + - If a tool returns an error, relay an human friendly error message to the user without exposing any database or script details. + - If required data (like `schema_structure`) is missing, guide the user to run the necessary previous steps (e.g., schema introspection). + + ### IMPORTANT + - If there is anything which is not in your scope or you cannot answer transfer the query to the root agent calling transfer_to_agent(data_model_discovery_agent, query) + """, + tools=[ + generate_summary_report, + export_full_report, + generate_erd_script, + ], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py new file mode 100644 index 0000000..9d8a4e7 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py @@ -0,0 +1,298 @@ +import json +import logging +from typing import Any + +from google.adk.tools import ToolContext + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +async def generate_summary_report( + tool_context: ToolContext, args: dict[str, Any] +) -> dict[str, Any]: + """ + Generates a high-level summary report of the database analysis. + + This tool reads the 'schema_structure' and 'data_profile' from the session state + to produce a markdown formatted text summary of the key findings from the + introspection and data profiling phases. + + Args: + tool_context: The ADK tool context, providing access to session state. + args: A dictionary for potential arguments (not used in this version). + + Returns: + A dictionary containing: + - status: "success" or "error". + - report_text: The markdown formatted summary report (on success). + - error: An error message (on failure). + """ + + schema_structure = tool_context.state.get("schema_structure") + data_profile = tool_context.state.get("data_profile") + selected_schema = tool_context.state.get("selected_schema", "N/A") + + if not schema_structure: + return {"error": "Schema structure not found. 
Please run introspection first."} + + summary = { + "tables": len(schema_structure.get("tables", {})), + "views": len(schema_structure.get("views", {})), + "explicit_fks": len(schema_structure.get("foreign_keys", [])), + "inferred_relationships": len( + schema_structure.get("inferred_relationships", []) + ), + "schema_anomalies": len(schema_structure.get("anomalies", [])), + "columns": sum( + len(t.get("columns", {})) + for t in schema_structure.get("tables", {}).values() + ), + } + + report = f"### Data Discovery Summary for Schema: {selected_schema}\n\n" + report += "**Schema Structure:**\n" + report += f"- Tables Analyzed: {summary['tables']}\n" + report += f"- Total Columns: {summary['columns']}\n" + report += f"- Views Found: {summary['views']}\n" + report += f"- Explicit Foreign Keys: {summary['explicit_fks']}\n" + report += ( + f"- Potential Inferred Relationships: {summary['inferred_relationships']}\n" + ) + report += f"- Schema Anomalies Detected: {summary['schema_anomalies']}\n\n" + + if data_profile: + report += "**Data Quality Profile Highlights:**\n" + null_issues = sum( + 1 + for table in data_profile.get("nullability", {}).values() + for null_pct in table.values() + if isinstance(null_pct, (int, float)) and null_pct > 50 + ) + orphan_issues = sum( + 1 + for orphan_pct in data_profile.get("orphan_records", {}).values() + if isinstance(orphan_pct, (int, float)) and orphan_pct > 10 + ) + type_anomalies = len(data_profile.get("type_anomalies", {})) + + report += f"- Columns with >50% NULLs: {null_issues} (in sampled data)\n" + report += ( + f"- FKs with >10% Orphan Records: {orphan_issues} (in sampled data)\n" + ) + report += f"- Columns with Potential Type Anomalies: {type_anomalies} (in sampled data)\n" + else: + report += "**Data Quality Profile:** Not yet run.\n" + + return {"status": "success", "report_text": report} + + +async def export_full_report(tool_context: ToolContext, args: dict) -> dict: + """ + Exports the full schema structure and data profile as a clean JSON report. + + Only JSON is supported. Backslashes are avoided in the output. + + Args: + tool_context: The ADK tool context providing session state. + args: A dictionary containing optional 'format' key. + + Returns: + Dict[str, Any]: { + "status": "success" | "error", + "message": Description, + "report_content": JSON string (pretty-printed), + "format": "JSON", + "error": Optional error message + } + """ + + schema_structure = tool_context.state.get("schema_structure") + data_profile = tool_context.state.get("data_profile") + + if not schema_structure: + return { + "status": "error", + "error": "Schema structure not found. Please run introspection first.", + } + + requested_format = args.get("format", "json").lower() + if requested_format != "json": + return { + "status": "error", + "error": f"Unsupported format '{requested_format}'. Only JSON is supported.", + } + + full_report = { + "schema_structure": schema_structure, + "data_profile": data_profile or "Not run", + } + + def safe_encoder(obj): + """ + Converts any non-serializable object into string automatically. + Handles Decimal, datetime, UUID, set, custom objects, etc. + """ + try: + return json.JSONEncoder().default(obj) + except Exception: + # Fallback: convert everything else to string + return str(obj) + + try: + json_output = json.dumps( + full_report, indent=2, ensure_ascii=False, default=safe_encoder + ) + + return { + "status": "success", + "message": "Full report generated in JSON format. 
You can copy the content below.", + "report_content": json_output, + "format": "JSON", + } + + except Exception as e: + logger.error(f"Error generating JSON report: {e}", exc_info=True) + return {"status": "error", "error": f"Failed to generate JSON report: {e!s}"} + + +async def generate_erd_script( + tool_context: ToolContext, args: dict[str, Any] +) -> dict[str, Any]: + """ + Generates a complete, valid Mermaid ER Diagram script. + + This function uses 'schema_structure' from the tool context's session state + to build a fully compliant Mermaid ERD. It includes tables, columns, data + types, constraints, and both explicit and inferred relationships. + + Automatically fixes known issues: + - Normalizes table names to uppercase. + - Removes invalid precision (e.g., decimal(10,2) -> decimal). + - Escapes quotes and special characters for Mermaid syntax. + - Ensures all sections render correctly. + + Args: + tool_context: The ADK tool context providing session state. + args: Optional argument dictionary (currently unused). + + Returns: + Dict[str, Any]: { + "status": "success" | "error", + "message": Description message, + "script_type": "Mermaid", + "script": Mermaid ERD text (if success), + "error": Optional error message (if failure) + } + """ + + schema_structure = tool_context.state.get("schema_structure") + if not schema_structure: + return { + "status": "error", + "error": "Schema structure not found. Please run introspection first.", + } + + def sanitize_datatype(dtype: str) -> str: + """Normalize SQL data types to Mermaid-safe equivalents.""" + if not dtype: + return "text" + dtype = dtype.strip().lower() + if dtype.startswith("decimal"): + return "decimal" + if dtype.startswith("varchar"): + return "varchar" + if dtype.startswith("numeric"): + return "numeric" + if "int" in dtype: + return "int" + if dtype.startswith("enum"): + return "enum" + if "timestamp" in dtype: + return "timestamp" + return ( + dtype.replace("(", "").replace(")", "").replace(",", "").replace(" ", "_") + ) + + def format_column( + table_name: str, + col_name: str, + col_info: dict[str, Any], + constraints_info: list[dict[str, Any]], + ) -> str: + """Format a column entry with proper constraints for Mermaid.""" + dtype = sanitize_datatype(col_info.get("type", "text")) + constraints = [] + + for c in constraints_info: + if col_name in c.get("columns", []): + c_type = c.get("type", "").upper() + if "PRIMARY" in c_type: + constraints.append("PK") + elif "UNIQUE" in c_type: + constraints.append("UK") + + if not col_info.get("nullable", True): + constraints.append("NN") + + for fk in schema_structure.get("foreign_keys", []): + if ( + fk.get("from_column") == col_name + and fk.get("from_table", "").lower() == table_name.lower() + ): + constraints.append("FK") + break + + constraint_str = f' "{", ".join(constraints)}"' if constraints else "" + return f" {dtype} {col_name}{constraint_str}" + + lines = ["erDiagram"] + + tables = schema_structure.get("tables", {}) + for table_name, table_info in tables.items(): + tname = table_name.upper() + lines.append(f" {tname} {{") + + columns = table_info.get("columns", {}) + constraints_info = table_info.get("constraints", []) + + for col_name, col_info in columns.items(): + lines.append( + format_column(table_name, col_name, col_info, constraints_info) + ) + + lines.append(" }") + lines.append("") + + fks = schema_structure.get("foreign_keys", []) + if fks: + lines.append(" %% -- Explicit Relationships --") + for fk in fks: + from_table = fk.get("from_table", "").upper() + 
to_table = fk.get("to_table", "").upper() + from_column = fk.get("from_column", "") + if from_table and to_table: + # The referenced (to) table is the "one" side; the referencing (from) table is the "many" side. + lines.append(f' {to_table} ||--o{{ {from_table} : "{from_column}"') + + inferred = schema_structure.get("inferred_relationships", []) + if inferred: + lines.append("\n %% -- Inferred Relationships --") + for rel in inferred: + from_table = rel.get("from_table", "").upper() + to_table = rel.get("to_table", "").upper() + from_column = rel.get("from_column", "") + + if from_table and to_table: + # Optional → Optional: }o--o{ + lines.append( + f' {from_table} }}o--o{{ {to_table} : "INFERRED: {from_column}"' + ) + + mermaid_script = "\n".join(lines) + "\n" + + return { + "status": "success", + "message": "Mermaid ERD script generated successfully. Paste this code into any Mermaid renderer.", + "script_type": "Mermaid", + "script": mermaid_script, + } diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/__init__.py @@ -0,0 +1 @@ +from . import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py new file mode 100644 index 0000000..6da155b --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py @@ -0,0 +1,74 @@ +from google.adk.agents.llm_agent import LlmAgent + +from app.config import MODEL + +from .tools import get_schema_details + +schema_introspection_agent = LlmAgent( + model=MODEL, + name="schema_introspection_agent", + description="Introspects the selected database schema to discover tables, columns, constraints, relationships, indexes, and views.", + instruction=""" + ### Role + You are a **Database Schema Introspection Agent**. Your sole task is to fetch and summarize the schema structure of a database. + + ### Scope + - You can only report **schema-level information**: tables, columns, constraints, indexes, foreign keys, inferred relationships, and anomalies. + - Do **not** answer questions about data content, queries, or performance. Forward such questions to the QA agent using: + ```python + transfer_to_agent(qa_agent, query) + ``` + + ### Formatting + - Present table-like data using proper Markdown pipe tables: + | Column 1 | Column 2 | Column 3 | + | --- | --- | --- | + | Row 1, Col 1 | Row 1, Col 2 | Row 1, Col 3 | + | Row 2, Col 1 | Row 2, Col 2 | Row 2, Col 3 | + | Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 | + + ### Task + + 1. **Receive Schema Name:** The user's query to this agent (available as the variable `query`) IS the schema name to be introspected. + + 2. **Call Tool:** Invoke the `get_schema_details` tool. You MUST pass the schema name as a dictionary to the `args` parameter of the tool. + - **Tool Call:** `get_schema_details(args={"schema_name": query})` + + 3. 
**Process Results:** + - If the tool result has `status` set to "success": + - Extract `schema_name` and `summary` from the tool's result. + - Construct a response to the user, confirming the schema and dynamically summarizing the findings based on the `summary` object. + + - **Response Template:** + "I have successfully introspected the schema '{tool_result.schema_name}'. Here's a summary of what I found: + - **Tables:** {tool_result.summary.tables} (with {tool_result.summary.columns} columns in total) + - **Views:** {tool_result.summary.views} + - **Constraints:** {tool_result.summary.constraints} (across all tables) + - **Indexes:** {tool_result.summary.indexes} (across all tables) + - **Explicit Foreign Keys:** {tool_result.summary.explicit_fks} + - **Potential Inferred Relationships:** {tool_result.summary.inferred_relationships} + - **Schema Relationship Anomalies Detected:** {tool_result.summary.anomalies} + + The full details are stored. What would you like to explore further about the '{tool_result.schema_name}' schema? I can help you with: + - 'List all tables.' + - 'Describe a specific table.' + - 'Show foreign keys involving a specific table.' + - 'Tell me about any anomalies found.' + - 'List any inferred relationships.'" + + - If the tool call returns an error, relay a human-friendly error message to the user without exposing database connection details. + + ### IMPORTANT + - If a request is outside your scope or you cannot answer it, transfer the query to the root agent by calling transfer_to_agent(data_model_discovery_agent, query) + - For follow-up questions about the discovered schema or its data, immediately call: + ```python + transfer_to_agent(qa_agent, query) + ``` + - Focus **only** on fetching and summarizing schema details. + """, + tools=[get_schema_details], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py new file mode 100644 index 0000000..69a51e4 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py @@ -0,0 +1,134 @@ +import logging +from typing import Any + +import mysql.connector + +# Import database connectors +import psycopg2 +import pyodbc +from google.adk.tools import ToolContext + +# Import utils +from .utils import mssql_utils, mysql_utils, postgresql_utils + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def _get_db_connection(metadata: dict[str, Any], password: str) -> Any: + db_type = metadata.get("db_type") + host = metadata.get("host") + port = metadata.get("port") + dbname = metadata.get("dbname") + user = metadata.get("user") + + if not all([db_type, host, port, dbname, user, password is not None]): + raise ValueError( + "Missing one or more required connection parameters in metadata or password. 
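+ # Note: the check above uses "password is not None" so an empty-string password is accepted; only a missing (None) password fails validation.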
+ ) + port = int(port) # type: ignore[arg-type] + logger.info( + f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}" + ) + if db_type == "postgresql": + return psycopg2.connect( + host=host, port=port, dbname=dbname, user=user, password=password + ) + elif db_type == "mysql": + return mysql.connector.connect( + host=host, port=port, database=dbname, user=user, password=password + ) + elif db_type == "mssql": + conn_str = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={host},{port};DATABASE={dbname};UID={user};PWD={password}" + return pyodbc.connect(conn_str) + else: + raise ValueError(f"Unsupported database type: {db_type}") + + +def _generate_summary(schema_details: dict[str, Any]) -> dict[str, int]: + """Generates a summary of the introspected schema structure.""" + summary = { + "tables": len(schema_details.get("tables", {})), + "views": len(schema_details.get("views", {})), + "explicit_fks": len(schema_details.get("foreign_keys", [])), + "inferred_relationships": len(schema_details.get("inferred_relationships", [])), + "anomalies": len(schema_details.get("anomalies", [])), + "columns": 0, + "constraints": 0, + "indexes": 0, + } + for table_info in schema_details.get("tables", {}).values(): + summary["columns"] += len(table_info.get("columns", {})) + summary["constraints"] += len(table_info.get("constraints", [])) + summary["indexes"] += len(table_info.get("indexes", [])) + return summary + + +async def get_schema_details( + tool_context: ToolContext, args: dict[str, Any] +) -> dict[str, Any]: + """ + Retrieves detailed schema information and a summary for the given schema_name. + Updates the session state with the selected_schema and schema_structure. + """ + schema_name = args.get("schema_name") + if not schema_name or not str(schema_name).strip(): + return {"error": "schema_name not provided in args or is empty."} + schema_name = str(schema_name).strip() + + db_conn_state = tool_context.state.get("db_connection") + db_creds = tool_context.state.get("db_creds_temp") + + if not db_conn_state or db_conn_state.get("status") != "connected": + return {"error": "Database not connected. Please connect first."} + if not db_creds: + return {"error": "Database credentials not found."} + + tool_context.state["selected_schema"] = schema_name + if "available_schemas" in tool_context.state: + tool_context.state["available_schemas"] = None + + metadata = db_conn_state["metadata"] + password = db_creds["password"] + db_type = metadata["db_type"] + + conn = None + try: + conn = _get_db_connection(metadata, password) + logger.info( + f"Successfully reconnected to {db_type} for introspection of schema '{schema_name}'." 
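+ # Each dialect-specific helper below returns the same schema_details dict shape: tables, views, foreign_keys, inferred_relationships, and anomalies.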
+ ) + + if db_type == "postgresql": + schema_details = postgresql_utils.get_postgres_schema_details( + conn, schema_name + ) + elif db_type == "mysql": + schema_details = mysql_utils.get_mysql_schema_details(conn, schema_name) + elif db_type == "mssql": + schema_details = mssql_utils.get_mssql_schema_details(conn, schema_name) + else: + return {"error": f"Introspection for {db_type} is not implemented."} + + tool_context.state["schema_structure"] = schema_details + logger.info(f"Schema structure for '{schema_name}' saved to session state.") + + summary = _generate_summary(schema_details) + + return { + "status": "success", + "message": f"Schema details for '{schema_name}' ({db_type}) retrieved and stored.", + "schema_name": schema_name, + "summary": summary, # Include the summary + } + except Exception as e: + logger.error(f"Error during schema introspection: {e}", exc_info=True) + return { + "error": f"Failed to get schema details for {db_type} ({schema_name}): {e!s}" + } + finally: + if conn: + try: + conn.close() + except Exception as e: + logger.error(f"Error closing {db_type} connection: {e}") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py new file mode 100644 index 0000000..df061f8 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py @@ -0,0 +1,297 @@ +import json +import logging +import os +import re +from typing import Any + +import google.auth +import pyodbc +from google import genai +from google.api_core import exceptions +from google.genai import types + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + +try: + _, project_id = google.auth.default() + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) +except google.auth.exceptions.DefaultCredentialsError: + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") # type: ignore[assignment] + +if not GOOGLE_CLOUD_PROJECT: + logger.warning( + "GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials." + ) + +GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get( + "GOOGLE_GENAI_USE_VERTEXAI", "True" +).lower() in ("true", "1") +MODEL = os.environ.get("MODEL", "gemini-1.5-pro") + +client = None +if GOOGLE_CLOUD_PROJECT: + try: + client = genai.Client( + vertexai=GOOGLE_GENAI_USE_VERTEXAI, + project=GOOGLE_CLOUD_PROJECT, + location=GOOGLE_CLOUD_LOCATION, + ) + logger.info( + f"GenAI Client initialized in mssql_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}" + ) + except Exception as e: + logger.error(f"Failed to initialize GenAI Client in mssql_utils: {e}") +else: + logger.error( + "Cannot initialize GenAI Client in mssql_utils: GOOGLE_CLOUD_PROJECT is not set." 
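+ # If no client could be initialized here, _analyze_with_llm below skips the LLM step and records an "LLM client not available." entry under anomalies instead of raising.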
+ ) + + +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts for SQL Server.""" + cursor = conn.cursor() + try: + cursor.execute(query) + if cursor.description: + columns = [column[0] for column in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, row, strict=False)) for row in rows] + return [] + except pyodbc.Error as ex: + sqlstate = ex.args[0] + logger.error(f"SQL Error ({sqlstate}): {ex} for query: {query}") + raise + finally: + cursor.close() + + +def _construct_llm_prompt( + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> str: + """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" + tables_context = {} + for table_name, table_info in schema_details.get("tables", {}).items(): + tables_context[table_name] = { + "columns": list(table_info.get("columns", {}).keys()), + "constraints": table_info.get("constraints", []), + } + context = { + "db_type": db_type, + "schema_name": schema_name, + "tables": tables_context, + "existing_foreign_keys": schema_details.get("foreign_keys", []), + } + context_json = json.dumps(context, indent=4) + prompt = f""" + You are a database expert analyzing the schema of a {db_type} database named '{schema_name}'. + Your task is to identify potential inferred relationships and relationship anomalies based on the provided schema information. + + Here is the schema context: + ```json + {context_json} + ``` + + **Tasks:** + + 1. **Inferred Relationship Suggestion:** + Analyze the table and column names. Suggest potential foreign key relationships that are NOT already defined in `existing_foreign_keys`. + Common patterns include columns like `user_id`, `product_code`, `order_uuid`, etc., potentially linking to `id` or similar columns in other tables (e.g., `users.id`). + For each suggestion, provide the `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` (why you think it's related), and a `suggestion` (e.g., "Consider adding a foreign key"). + + 2. **Relationship Anomaly Detection:** + Examine the `existing_foreign_keys`. For each foreign key, check if the `to_table` and `to_column` exist in the `tables` context. Also, verify if the `to_column` in the `to_table` is part of a PRIMARY KEY or UNIQUE constraint in that table's constraints list. + Flag any anomalies where: + a. The `to_table` is not in the `tables` context. + b. The `to_column` is not in the `columns` list of the `to_table`. + c. The `to_column` in the `to_table` is NOT listed as a 'PRIMARY KEY' or 'UNIQUE' in its constraints. + For each anomaly, provide the `constraint_name`, `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` of the issue, and a `suggestion` (e.g., "Verify target column exists" or "Target column should be PK/UK"). + + **Output Format:** + Return your findings as a single JSON object with two keys: "inferred_relationships" and "anomalies". The JSON must be well-formed. 
+ + ```json + {{ + "inferred_relationships": [ + {{ + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ], + "anomalies": [ + {{ + "constraint_name": "string", + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ] + }} + ``` + If no inferred relationships or anomalies are found, return empty lists for the respective keys. + """ + return prompt + + +def _extract_json_content(text: str) -> str: + """Extracts JSON content from Markdown-style code fences (```json ... ```).""" + if not text: + return "" + match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) + extracted = match.group(1).strip() if match else text.strip() + return extracted + + +def _analyze_with_llm( + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> dict[str, list[dict[str, Any]]]: + """Calls an LLM to get inferred relationships and anomalies.""" + if not client: + logger.error("GenAI Client not initialized. Skipping LLM analysis.") + return { + "inferred_relationships": [], + "anomalies": [{"error": "LLM client not available."}], + } + + prompt = _construct_llm_prompt(schema_name, db_type, schema_details) + logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") + generated_text = "" + try: + logger.debug(f"****** Custom_LLM_Request: {prompt}") + response = client.models.generate_content( + model=MODEL, + contents=[types.Part.from_text(text=prompt)], # type: ignore[arg-type] + config=types.GenerateContentConfig(response_mime_type="application/json"), + ) + generated_text = response.candidates[0].content.parts[0].text # type: ignore[index, union-attr, assignment] + logger.debug(f"****** Raw LLM Response: {generated_text}") + cleaned_json = _extract_json_content(generated_text) + logger.debug( + f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}" + ) + llm_output = json.loads(cleaned_json) + inferred = llm_output.get("inferred_relationships", []) + anomalies = llm_output.get("anomalies", []) + if not isinstance(inferred, list) or not isinstance(anomalies, list): + raise ValueError( + "LLM response is not in the expected list format for keys." + ) + return {"inferred_relationships": inferred, "anomalies": anomalies} + except json.JSONDecodeError as e: + logger.error( + f"Error decoding LLM JSON response: {e}. 
Cleaned Response: {cleaned_json}" + ) + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}], + } + except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: + logger.error(f"Error calling LLM or processing response: {e}") + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM analysis failed: {e}"}], + } + except Exception as e: + logger.error(f"Unexpected error during LLM analysis: {e}", exc_info=True) + return { + "inferred_relationships": [], + "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}], + } + + +def get_mssql_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: + logger.info(f"Fetching MSSQL schema details for: {schema_name}") + details: dict[str, Any] = { + "tables": {}, + "views": {}, + "foreign_keys": [], + "inferred_relationships": [], + "anomalies": [], + } + + tables_query = f"SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_TYPE = 'BASE TABLE';" + tables = _execute_query(conn, tables_query) + for table in tables: + t_name = table["TABLE_NAME"] + details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} + cols_query = f"SELECT COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_NAME = '{t_name}';" + for col in _execute_query(conn, cols_query): + details["tables"][t_name]["columns"][col["COLUMN_NAME"]] = { + "type": col["DATA_TYPE"], + "length": col["CHARACTER_MAXIMUM_LENGTH"], + "precision": col["NUMERIC_PRECISION"], + "scale": col["NUMERIC_SCALE"], + "nullable": col["IS_NULLABLE"] == "YES", + "default": col["COLUMN_DEFAULT"], + } + + constraints_query = f""" + SELECT KCU.TABLE_NAME, TC.CONSTRAINT_NAME, TC.CONSTRAINT_TYPE, KCU.COLUMN_NAME, CC.CHECK_CLAUSE + FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC + LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA AND TC.TABLE_NAME = KCU.TABLE_NAME + LEFT JOIN INFORMATION_SCHEMA.CHECK_CONSTRAINTS AS CC ON TC.CONSTRAINT_NAME = CC.CONSTRAINT_NAME AND TC.CONSTRAINT_SCHEMA = CC.CONSTRAINT_SCHEMA + WHERE TC.TABLE_SCHEMA = '{schema_name}' AND KCU.TABLE_NAME = '{t_name}'; + """ + details["tables"][t_name]["constraints"] = _execute_query( + conn, constraints_query + ) + + indexes_query = f""" + SELECT t.name AS table_name, ind.name AS index_name, COL_NAME(ic.object_id, ic.column_id) AS column_name, ind.is_unique + FROM sys.indexes ind INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id AND ind.index_id = ic.index_id + INNER JOIN sys.tables t ON ind.object_id = t.object_id INNER JOIN sys.schemas s ON t.schema_id = s.schema_id + WHERE s.name = '{schema_name}' AND t.name = '{t_name}' AND ind.is_hypothetical = 0 AND ind.type > 0; + """ + try: + indexes = _execute_query(conn, indexes_query) + grouped_indexes = {} + for index in indexes: + idx_name = index["index_name"] + if not idx_name: + continue + if idx_name not in grouped_indexes: + grouped_indexes[idx_name] = { + "name": idx_name, + "columns": [], + "unique": index["is_unique"], + } + if index["column_name"] not in grouped_indexes[idx_name]["columns"]: + grouped_indexes[idx_name]["columns"].append(index["column_name"]) + details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) + except Exception as e: + logger.error(f"Error fetching MSSQL indexes for 
{t_name}: {e}") + + fks_query = f""" + SELECT KCU1.CONSTRAINT_NAME AS constraint_name, KCU1.TABLE_NAME AS from_table, KCU1.COLUMN_NAME AS from_column, + KCU2.TABLE_SCHEMA AS to_schema, KCU2.TABLE_NAME AS to_table, KCU2.COLUMN_NAME AS to_column + FROM INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS RC + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU1 ON KCU1.CONSTRAINT_SCHEMA = RC.CONSTRAINT_SCHEMA AND KCU1.CONSTRAINT_NAME = RC.CONSTRAINT_NAME + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU2 ON KCU2.CONSTRAINT_SCHEMA = RC.UNIQUE_CONSTRAINT_SCHEMA AND KCU2.CONSTRAINT_NAME = RC.UNIQUE_CONSTRAINT_NAME AND KCU2.ORDINAL_POSITION = KCU1.ORDINAL_POSITION + WHERE KCU1.TABLE_SCHEMA = '{schema_name}'; + """ + details["foreign_keys"] = _execute_query(conn, fks_query) + views_query = f"SELECT TABLE_NAME AS view_name, VIEW_DEFINITION FROM INFORMATION_SCHEMA.VIEWS WHERE TABLE_SCHEMA = '{schema_name}';" + details["views"] = { + view["view_name"]: {"definition": view["VIEW_DEFINITION"]} + for view in _execute_query(conn, views_query) + } + + llm_analysis = _analyze_with_llm(schema_name, "Microsoft SQL Server", details) + details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) + details["anomalies"] = llm_analysis.get("anomalies", []) + logger.info( + f"Found {len(details['inferred_relationships'])} potential inferred relationships for MSSQL." + ) + logger.info( + f"Found {len(details['anomalies'])} potential relationship anomalies for MSSQL." + ) + return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py new file mode 100644 index 0000000..c244326 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py @@ -0,0 +1,317 @@ +import json +import logging +import os +import re +from typing import Any + +import google.auth +import mysql.connector +from google import genai +from google.api_core import exceptions +from google.genai import types + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + +try: + _, project_id = google.auth.default() + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) +except google.auth.exceptions.DefaultCredentialsError: + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") # type: ignore[assignment] + +if not GOOGLE_CLOUD_PROJECT: + logger.warning( + "GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials." + ) + +GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get( + "GOOGLE_GENAI_USE_VERTEXAI", "True" +).lower() in ("true", "1") +MODEL = "gemini-2.5-pro" + +client = None +if GOOGLE_CLOUD_PROJECT: + try: + client = genai.Client( + vertexai=GOOGLE_GENAI_USE_VERTEXAI, + project=GOOGLE_CLOUD_PROJECT, + location=GOOGLE_CLOUD_LOCATION, + ) + logger.info( + f"GenAI Client initialized. 
VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}" + ) + except Exception as e: + logger.error(f"Failed to initialize GenAI Client: {e}") +else: + logger.error("Cannot initialize GenAI Client: GOOGLE_CLOUD_PROJECT is not set.") + + +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts.""" + cursor = conn.cursor(dictionary=True) + try: + cursor.execute(query) + return cursor.fetchall() + finally: + cursor.close() + + +def _construct_llm_prompt( + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> str: + """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" + + tables_context = {} + for table_name, table_info in schema_details.get("tables", {}).items(): + tables_context[table_name] = { + "columns": list(table_info.get("columns", {}).keys()), + "constraints": table_info.get("constraints", []), + } + + context = { + "db_type": db_type, + "schema_name": schema_name, + "tables": tables_context, + "existing_foreign_keys": schema_details.get("foreign_keys", []), + } + + # Format JSON for readability + context_json = json.dumps(context, indent=4) + + prompt = f""" + You are a database expert analyzing the schema of a {db_type} database named '{schema_name}'. + Your task is to identify potential inferred relationships and relationship anomalies based on the provided schema information. + + Here is the schema context: + ```json + {context_json} + ``` + + **Tasks:** + + 1. **Inferred Relationship Suggestion:** + Analyze the table and column names. Suggest potential foreign key relationships that are NOT already defined in `existing_foreign_keys`. + Common patterns include columns like `user_id`, `product_code`, `order_uuid`, etc., potentially linking to `id` or similar columns in other tables (e.g., `users.id`). + For each suggestion, provide the `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` (why you think it's related), and a `suggestion` (e.g., "Consider adding a foreign key"). + + 2. **Relationship Anomaly Detection:** + Examine the `existing_foreign_keys`. For each foreign key, check if the `to_table` and `to_column` exist in the `tables` context. Also, verify if the `to_column` in the `to_table` is part of a PRIMARY KEY or UNIQUE constraint in that table's constraints list. + Flag any anomalies where: + a. The `to_table` is not in the `tables` context. + b. The `to_column` is not in the `columns` list of the `to_table`. + c. The `to_column` in the `to_table` is NOT listed as a 'PRIMARY KEY' or 'UNIQUE' in its constraints. + For each anomaly, provide the `constraint_name`, `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` of the issue, and a `suggestion` (e.g., "Verify target column exists" or "Target column should be PK/UK"). + + **Output Format:** + Return your findings as a single JSON object with two keys: "inferred_relationships" and "anomalies". The JSON must be well-formed. 
+ + ```json + {{ + "inferred_relationships": [ + {{ + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ], + "anomalies": [ + {{ + "constraint_name": "string", + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ] + }} + ``` + If no inferred relationships or anomalies are found, return empty lists for the respective keys. + """ + return prompt + + +def _extract_json_content(text: str) -> str: + """ + Extracts JSON content from Markdown-style code fences (```json ... ```). + If no fences are present, returns the text as-is. + """ + if not text: + return "" + + match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) + if match: + extracted = match.group(1).strip() + else: + extracted = text.strip() + + try: + parsed = json.loads(extracted) + return json.dumps(parsed, indent=4) + except json.JSONDecodeError: + return extracted + + +def _analyze_with_llm( + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> dict[str, list[dict[str, Any]]]: + """Calls an LLM to get inferred relationships and anomalies.""" + if not client: + logger.error("GenAI Client not initialized. Skipping LLM analysis.") + return { + "inferred_relationships": [], + "anomalies": [{"error": "LLM client not available."}], + } + + prompt = _construct_llm_prompt(schema_name, db_type, schema_details) + logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") + generated_text = "" + try: + logger.debug(f"****** Custom_LLM_Request: {prompt}") + response = client.models.generate_content( + model=MODEL, + contents=[types.Part.from_text(text=prompt)], # type: ignore[arg-type] + ) + generated_text = response.candidates[0].content.parts[0].text # type: ignore[index, union-attr, assignment] + logger.debug(f"****** Raw LLM Response: {generated_text}") + + # handles ```json blocks + cleaned_json = _extract_json_content(generated_text) + logger.debug( + f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}" + ) + + # Parse the cleaned JSON + llm_output = json.loads(cleaned_json) + inferred = llm_output.get("inferred_relationships", []) + anomalies = llm_output.get("anomalies", []) + + if not isinstance(inferred, list) or not isinstance(anomalies, list): + raise ValueError( + "LLM response is not in the expected list format for keys." + ) + + return {"inferred_relationships": inferred, "anomalies": anomalies} + + except json.JSONDecodeError as e: + logger.error( + f"Error decoding LLM JSON response: {e}. Cleaned Response: {cleaned_json}" + ) + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}], + } + except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: + logger.error(f"Error calling LLM or processing response: {e}") + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM analysis failed: {e}"}], + } + except Exception as e: + logger.error(f"Unexpected error during LLM analysis: {e}") + return { + "inferred_relationships": [], + "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}], + } + + +def get_mysql_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: + # For MySQL, schema_name is the database name. 
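+ # MySQL has no separate schema layer, so the target database is selected on the existing connection before the SHOW/DESCRIBE queries below are run.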
+ logger.info(f"Fetching MySQL schema details for: {schema_name}") + try: + conn.database = schema_name + except mysql.connector.Error as err: + logger.error(f"MySQL change database failed: {err}") + raise + + details: dict[str, Any] = { + "tables": {}, + "views": {}, + "foreign_keys": [], + "inferred_relationships": [], + "anomalies": [], + } + + # 1. Fetch Basic Schema Info + tables_query = "SHOW FULL TABLES WHERE Table_type = 'BASE TABLE';" + tables = _execute_query(conn, tables_query) + table_names = [next(iter(t.values())) for t in tables] + + for t_name in table_names: + details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} + cols_query = f"DESCRIBE `{t_name}`;" + columns = _execute_query(conn, cols_query) + for col in columns: + details["tables"][t_name]["columns"][col["Field"]] = { + "type": col["Type"], + "nullable": col["Null"] == "YES", + "default": col["Default"], + "key": col["Key"], + "extra": col["Extra"], + } + + constraints_query = f""" + SELECT KCU.CONSTRAINT_NAME, TC.CONSTRAINT_TYPE, KCU.COLUMN_NAME + FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC + LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU + ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA AND TC.TABLE_NAME = KCU.TABLE_NAME + WHERE TC.TABLE_SCHEMA = '{schema_name}' AND TC.TABLE_NAME = '{t_name}' + AND TC.CONSTRAINT_TYPE IN ('PRIMARY KEY', 'UNIQUE', 'FOREIGN KEY', 'CHECK'); + """ + details["tables"][t_name]["constraints"] = _execute_query( + conn, constraints_query + ) + + indexes_query = f"SHOW INDEX FROM `{t_name}`;" + indexes = _execute_query(conn, indexes_query) + grouped_indexes = {} + for index in indexes: + idx_name = index["Key_name"] + if idx_name not in grouped_indexes: + grouped_indexes[idx_name] = { + "name": idx_name, + "columns": [], + "unique": index["Non_unique"] == 0, + } + grouped_indexes[idx_name]["columns"].append(index["Column_name"]) + details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) + + fks_query = f""" + SELECT KCU.TABLE_NAME AS from_table, KCU.COLUMN_NAME AS from_column, + KCU.REFERENCED_TABLE_NAME AS to_table, KCU.REFERENCED_COLUMN_NAME AS to_column, KCU.CONSTRAINT_NAME + FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU + WHERE KCU.TABLE_SCHEMA = '{schema_name}' AND KCU.REFERENCED_TABLE_NAME IS NOT NULL; + """ + details["foreign_keys"] = _execute_query(conn, fks_query) + + views_query = "SHOW FULL TABLES WHERE Table_type = 'VIEW';" + views = _execute_query(conn, views_query) + for v_name in [next(iter(v.values())) for v in views]: + try: + definition_query = f"SHOW CREATE VIEW `{v_name}`;" + definition = _execute_query(conn, definition_query) + details["views"][v_name] = {"definition": definition[0]["Create View"]} + except Exception as e: + logger.warning(f"Could not fetch view definition for {v_name}: {e}") + details["views"][v_name] = {"definition": "N/A"} + + # 2. LLM-based Analysis for Inferred Relationships and Anomalies + llm_analysis = _analyze_with_llm(schema_name, "MySQL", details) + details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) + details["anomalies"] = llm_analysis.get("anomalies", []) + + logger.info( + f"Found {len(details['inferred_relationships'])} potential inferred relationships." 
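+ # These counts cover LLM-suggested relationships only; explicit foreign keys are stored separately under details["foreign_keys"].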
+ ) + logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies.") + + logger.debug("************************") + logger.info(details) + logger.debug("************************") + + return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py new file mode 100644 index 0000000..2b76be9 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py @@ -0,0 +1,296 @@ +import json +import logging +import os +import re +from typing import Any + +import google.auth +from google import genai +from google.api_core import exceptions +from google.genai import types + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + +try: + _, project_id = google.auth.default() + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) +except google.auth.exceptions.DefaultCredentialsError: + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") # type: ignore[assignment] + +if not GOOGLE_CLOUD_PROJECT: + logger.warning( + "GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials." + ) + +GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get( + "GOOGLE_GENAI_USE_VERTEXAI", "True" +).lower() in ("true", "1") +MODEL = os.environ.get("MODEL", "gemini-2.5-pro") + +client = None +if GOOGLE_CLOUD_PROJECT: + try: + client = genai.Client( + vertexai=GOOGLE_GENAI_USE_VERTEXAI, + project=GOOGLE_CLOUD_PROJECT, + location=GOOGLE_CLOUD_LOCATION, + ) + logger.info( + f"GenAI Client initialized in postgres_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}" + ) + except Exception as e: + logger.error(f"Failed to initialize GenAI Client in postgres_utils: {e}") +else: + logger.error( + "Cannot initialize GenAI Client in postgres_utils: GOOGLE_CLOUD_PROJECT is not set." + ) + + +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" + cursor = conn.cursor() + try: + cursor.execute(query) + if cursor.description: + columns = [desc[0] for desc in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, row, strict=False)) for row in rows] + return [] + finally: + cursor.close() + + +def _construct_llm_prompt( + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> str: + """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" + tables_context = {} + for table_name, table_info in schema_details.get("tables", {}).items(): + tables_context[table_name] = { + "columns": list(table_info.get("columns", {}).keys()), + "constraints": table_info.get("constraints", []), + } + context = { + "db_type": db_type, + "schema_name": schema_name, + "tables": tables_context, + "existing_foreign_keys": schema_details.get("foreign_keys", []), + } + context_json = json.dumps(context, indent=4) + prompt = f""" + You are a database expert analyzing the schema of a {db_type} database named '{schema_name}'. + Your task is to identify potential inferred relationships and relationship anomalies based on the provided schema information. 
+ + Here is the schema context: + ```json + {context_json} + ``` + + **Tasks:** + + 1. **Inferred Relationship Suggestion:** + Analyze the table and column names. Suggest potential foreign key relationships that are NOT already defined in `existing_foreign_keys`. + Common patterns include columns like `user_id`, `product_code`, `order_uuid`, etc., potentially linking to `id` or similar columns in other tables (e.g., `users.id`). + For each suggestion, provide the `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` (why you think it's related), and a `suggestion` (e.g., "Consider adding a foreign key"). + + 2. **Relationship Anomaly Detection:** + Examine the `existing_foreign_keys`. For each foreign key, check if the `to_table` and `to_column` exist in the `tables` context. Also, verify if the `to_column` in the `to_table` is part of a PRIMARY KEY or UNIQUE constraint in that table's constraints list. + Flag any anomalies where: + a. The `to_table` is not in the `tables` context. + b. The `to_column` is not in the `columns` list of the `to_table`. + c. The `to_column` in the `to_table` is NOT listed as a 'PRIMARY KEY' or 'UNIQUE' in its constraints. + For each anomaly, provide the `constraint_name`, `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` of the issue, and a `suggestion` (e.g., "Verify target column exists" or "Target column should be PK/UK"). + + **Output Format:** + Return your findings as a single JSON object with two keys: "inferred_relationships" and "anomalies". The JSON must be well-formed. + + ```json + {{ + "inferred_relationships": [ + {{ + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ], + "anomalies": [ + {{ + "constraint_name": "string", + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ] + }} + ``` + If no inferred relationships or anomalies are found, return empty lists for the respective keys. + """ + return prompt + + +def _extract_json_content(text: str) -> str: + """Extracts JSON content from Markdown-style code fences (```json ... ```).""" + if not text: + return "" + match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) + extracted = match.group(1).strip() if match else text.strip() + try: + parsed = json.loads(extracted) + return json.dumps(parsed, indent=4) + except json.JSONDecodeError: + return extracted + + +def _analyze_with_llm( + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> dict[str, list[dict[str, Any]]]: + """Calls an LLM to get inferred relationships and anomalies.""" + if not client: + logger.error("GenAI Client not initialized. 
Skipping LLM analysis.") + return { + "inferred_relationships": [], + "anomalies": [{"error": "LLM client not available."}], + } + + prompt = _construct_llm_prompt(schema_name, db_type, schema_details) + logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") + generated_text = "" + try: + logger.debug(f"****** Custom_LLM_Request: {prompt}") + response = client.models.generate_content( + model=MODEL, + contents=[types.Part.from_text(text=prompt)], # type: ignore[arg-type] + config=types.GenerateContentConfig(response_mime_type="application/json"), + ) + generated_text = response.candidates[0].content.parts[0].text # type: ignore[index, union-attr, assignment] + logger.debug(f"****** Raw LLM Response: {generated_text}") + cleaned_json = _extract_json_content(generated_text) + logger.debug( + f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}" + ) + llm_output = json.loads(cleaned_json) + inferred = llm_output.get("inferred_relationships", []) + anomalies = llm_output.get("anomalies", []) + if not isinstance(inferred, list) or not isinstance(anomalies, list): + raise ValueError( + "LLM response is not in the expected list format for keys." + ) + return {"inferred_relationships": inferred, "anomalies": anomalies} + except json.JSONDecodeError as e: + logger.error( + f"Error decoding LLM JSON response: {e}. Cleaned Response: {cleaned_json}" + ) + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}], + } + except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: + logger.error(f"Error calling LLM or processing response: {e}") + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM analysis failed: {e}"}], + } + except Exception as e: + logger.error(f"Unexpected error during LLM analysis: {e}") + return { + "inferred_relationships": [], + "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}], + } + + +def get_postgres_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: + details: dict[str, Any] = { + "tables": {}, + "views": {}, + "foreign_keys": [], + "inferred_relationships": [], + "anomalies": [], + } + logger.info(f"Fetching PostgreSQL schema details for: {schema_name}") + + tables_query = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE';" + tables = _execute_query(conn, tables_query) + for table in tables: + t_name = table["table_name"] + details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} + cols_query = f""" + SELECT column_name, data_type, character_maximum_length, numeric_precision, numeric_scale, is_nullable, column_default + FROM information_schema.columns WHERE table_schema = '{schema_name}' AND table_name = '{t_name}'; + """ + for col in _execute_query(conn, cols_query): + details["tables"][t_name]["columns"][col["column_name"]] = { + "type": col["data_type"], + "length": col["character_maximum_length"], + "precision": col["numeric_precision"], + "scale": col["numeric_scale"], + "nullable": col["is_nullable"] == "YES", + "default": col["column_default"], + } + constraints_query = f""" + SELECT tc.table_name, tc.constraint_name, tc.constraint_type, kcu.column_name, cc.check_clause + FROM information_schema.table_constraints tc + LEFT JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema AND tc.table_name = kcu.table_name + LEFT JOIN 
information_schema.check_constraints cc ON tc.constraint_name = cc.constraint_name AND tc.table_schema = cc.constraint_schema + WHERE tc.table_schema = '{schema_name}' AND tc.table_name = '{t_name}'; + """ + details["tables"][t_name]["constraints"] = _execute_query( + conn, constraints_query + ) + indexes_query = f""" + SELECT t.relname AS table_name, i.relname AS index_name, a.attname AS column_name, ix.indisunique AS is_unique + FROM pg_class t JOIN pg_index ix ON t.oid = ix.indrelid JOIN pg_class i ON i.oid = ix.indexrelid + LEFT JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = ANY(ix.indkey) + JOIN pg_namespace n ON t.relnamespace = n.oid WHERE n.nspname = '{schema_name}' AND t.relname = '{t_name}' AND t.relkind = 'r'; + """ + try: + indexes = _execute_query(conn, indexes_query) + grouped_indexes = {} + for index in indexes: + if index["column_name"]: + idx_name = index["index_name"] + if idx_name not in grouped_indexes: + grouped_indexes[idx_name] = { + "name": idx_name, + "columns": [], + "unique": index["is_unique"], + } + if index["column_name"] not in grouped_indexes[idx_name]["columns"]: + grouped_indexes[idx_name]["columns"].append( + index["column_name"] + ) + details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) + except Exception as e: + logger.error(f"Error fetching PostgreSQL indexes for {t_name}: {e}") + + fks_query = f""" + SELECT tc.constraint_name, tc.table_name AS from_table, kcu.column_name AS from_column, + ccu.table_schema AS to_schema, ccu.table_name AS to_table, ccu.column_name AS to_column + FROM information_schema.table_constraints AS tc JOIN information_schema.key_column_usage AS kcu + ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema + JOIN information_schema.constraint_column_usage AS ccu + ON ccu.constraint_name = tc.constraint_name AND ccu.table_schema = tc.table_schema + WHERE tc.constraint_type = 'FOREIGN KEY' AND tc.table_schema = '{schema_name}'; + """ + details["foreign_keys"] = _execute_query(conn, fks_query) + views_query = f"SELECT table_name AS view_name, view_definition FROM information_schema.views WHERE table_schema = '{schema_name}';" + for view in _execute_query(conn, views_query): + details["views"][view["view_name"]] = {"definition": view["view_definition"]} + + llm_analysis = _analyze_with_llm(schema_name, "PostgreSQL", details) + details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) + details["anomalies"] = llm_analysis.get("anomalies", []) + logger.info( + f"Found {len(details['inferred_relationships'])} potential inferred relationships for PostgreSQL." + ) + logger.info( + f"Found {len(details['anomalies'])} potential relationship anomalies for PostgreSQL." 
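+ # "details" is the full introspection payload that get_schema_details stores in session state as "schema_structure".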
+ ) + return details diff --git a/agent-app/pyproject.toml b/agent-app/pyproject.toml index 457526d..c13db66 100644 --- a/agent-app/pyproject.toml +++ b/agent-app/pyproject.toml @@ -14,6 +14,8 @@ dependencies = [ "uvicorn~=0.34.0", "psycopg2-binary>=2.9.10", "google-genai~=1.41.0", + "mysql-connector-python", + "pyodbc", "GitPython>=3.1.45", "google-cloud-storage", "reportlab", @@ -28,7 +30,6 @@ dependencies = [ "scipy~=1.15.0", "pygithub~=2.8.1", "googlesearch-python" - ] requires-python = ">=3.10,<3.14" diff --git a/agent-app/uv.lock b/agent-app/uv.lock index ece3f6f..e2d5e54 100644 --- a/agent-app/uv.lock +++ b/agent-app/uv.lock @@ -33,12 +33,14 @@ dependencies = [ { name = "googlesearch-python" }, { name = "markdown-it-py" }, { name = "markdown-pdf" }, + { name = "mysql-connector-python" }, { name = "opentelemetry-exporter-gcp-trace" }, { name = "pandas" }, { name = "pdfplumber" }, { name = "plantuml" }, { name = "psycopg2-binary" }, { name = "pygithub" }, + { name = "pyodbc" }, { name = "reportlab" }, { name = "scipy" }, { name = "tabulate" }, @@ -82,6 +84,7 @@ requires-dist = [ { name = "markdown-it-py", specifier = "==3.0.0" }, { name = "markdown-pdf", specifier = "==1.9" }, { name = "mypy", marker = "extra == 'lint'", specifier = "~=1.15.0" }, + { name = "mysql-connector-python" }, { name = "opentelemetry-exporter-gcp-trace", specifier = "~=1.9.0" }, { name = "pandas", specifier = "==2.3.2" }, { name = "pdfplumber" }, @@ -89,6 +92,7 @@ requires-dist = [ { name = "psycopg2-binary", specifier = ">=2.9.10" }, { name = "pygithub", specifier = "~=2.8.1" }, { name = "pylint", marker = "extra == 'lint'", specifier = ">=4.0.1" }, + { name = "pyodbc" }, { name = "reportlab" }, { name = "ruff", marker = "extra == 'lint'", specifier = ">=0.4.6" }, { name = "scipy", specifier = "~=1.15.0" }, @@ -2138,6 +2142,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963 }, ] +[[package]] +name = "mysql-connector-python" +version = "9.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/39/33/b332b001bc8c5ee09255a0d4b09a254da674450edd6a3e5228b245ca82a0/mysql_connector_python-9.5.0.tar.gz", hash = "sha256:92fb924285a86d8c146ebd63d94f9eaefa548da7813bc46271508fdc6cc1d596", size = 12251077 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/5d/30210fcf7ba98d1e03de0c47a58218ab5313d82f2e01ae53b47f45c36b9d/mysql_connector_python-9.5.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:77d14c9fde90726de22443e8c5ba0912a4ebb632cc1ade52a349dacbac47b140", size = 17579085 }, + { url = "https://files.pythonhosted.org/packages/77/92/ea79a0875436665330a81e82b4b73a6d52aebcfb1cf4d97f4ad4bd4dedf5/mysql_connector_python-9.5.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:4d603b55de310b9689bb3cb5e57fe97e98756e36d62f8f308f132f2c724f62b8", size = 18445098 }, + { url = "https://files.pythonhosted.org/packages/5f/f2/4578b5093f46985c659035e880e70e8b0bed44d4a59ad4e83df5d49b9c69/mysql_connector_python-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:48ffa71ba748afaae5c45ed9a085a72604368ce611fe81c3fdc146ef60181d51", size = 33660118 }, + { url = "https://files.pythonhosted.org/packages/c5/60/63135610ae0cee1260ce64874c1ddbf08e7fb560c21a3d9cce88b0ddc266/mysql_connector_python-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = 
"sha256:77c71df48293d3c08713ff7087cf483804c8abf41a4bb4aefea7317b752c8e9a", size = 34096212 }, + { url = "https://files.pythonhosted.org/packages/3e/b1/78dc693552cfbb45076b3638ca4c402fae52209af8f276370d02d78367a0/mysql_connector_python-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:4f8d2d9d586c34dc9508a44d19cf30ccafabbbd12d7f8ab58da3af118636843c", size = 16512395 }, + { url = "https://files.pythonhosted.org/packages/05/03/77347d58b0027ce93a41858477e08422e498c6ebc24348b1f725ed7a67ae/mysql_connector_python-9.5.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:653e70cd10cf2d18dd828fae58dff5f0f7a5cf7e48e244f2093314dddf84a4b9", size = 17578984 }, + { url = "https://files.pythonhosted.org/packages/a5/bb/0f45c7ee55ebc56d6731a593d85c0e7f25f83af90a094efebfd5be9fe010/mysql_connector_python-9.5.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:5add93f60b3922be71ea31b89bc8a452b876adbb49262561bd559860dae96b3f", size = 18445067 }, + { url = "https://files.pythonhosted.org/packages/1c/ec/054de99d4aa50d851a37edca9039280f7194cc1bfd30aab38f5bd6977ebe/mysql_connector_python-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:20950a5e44896c03e3dc93ceb3a5e9b48c9acae18665ca6e13249b3fe5b96811", size = 33668029 }, + { url = "https://files.pythonhosted.org/packages/90/a2/e6095dc3a7ad5c959fe4a65681db63af131f572e57cdffcc7816bc84e3ad/mysql_connector_python-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:7fdd3205b9242c284019310fa84437f3357b13f598e3f9b5d80d337d4a6406b8", size = 34101687 }, + { url = "https://files.pythonhosted.org/packages/9c/88/bc13c33fca11acaf808bd1809d8602d78f5bb84f7b1e7b1a288c383a14fd/mysql_connector_python-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:c021d8b0830958b28712c70c53b206b4cf4766948dae201ea7ca588a186605e0", size = 16511749 }, + { url = "https://files.pythonhosted.org/packages/02/89/167ebee82f4b01ba7339c241c3cc2518886a2be9f871770a1efa81b940a0/mysql_connector_python-9.5.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:a72c2ef9d50b84f3c567c31b3bf30901af740686baa2a4abead5f202e0b7ea61", size = 17581904 }, + { url = "https://files.pythonhosted.org/packages/67/46/630ca969ce10b30fdc605d65dab4a6157556d8cc3b77c724f56c2d83cb79/mysql_connector_python-9.5.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:bd9ba5a946cfd3b3b2688a75135357e862834b0321ed936fd968049be290872b", size = 18448195 }, + { url = "https://files.pythonhosted.org/packages/f6/87/4c421f41ad169d8c9065ad5c46673c7af889a523e4899c1ac1d6bfd37262/mysql_connector_python-9.5.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5ef7accbdf8b5f6ec60d2a1550654b7e27e63bf6f7b04020d5fb4191fb02bc4d", size = 33668638 }, + { url = "https://files.pythonhosted.org/packages/a6/01/67cf210d50bfefbb9224b9a5c465857c1767388dade1004c903c8e22a991/mysql_connector_python-9.5.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a6e0a4a0274d15e3d4c892ab93f58f46431222117dba20608178dfb2cc4d5fd8", size = 34102899 }, + { url = "https://files.pythonhosted.org/packages/cd/ef/3d1a67d503fff38cc30e11d111cf28f0976987fb175f47b10d44494e1080/mysql_connector_python-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:b6c69cb37600b7e22f476150034e2afbd53342a175e20aea887f8158fc5e3ff6", size = 16512684 }, + { url = "https://files.pythonhosted.org/packages/72/18/f221aeac49ce94ac119a427afbd51fe1629d48745b571afc0de49647b528/mysql_connector_python-9.5.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:1f5f7346b0d5edb2e994c1bd77b3f5eed88b0ca368ad6788d1012c7e56d7bf68", size = 17581933 }, + { url = 
"https://files.pythonhosted.org/packages/de/8e/14d44db7353350006a12e46d61c3a995bba06acd7547fc78f9bb32611e0c/mysql_connector_python-9.5.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:07bf52591b4215cb4318b4617c327a6d84c31978c11e3255f01a627bcda2618e", size = 18448446 }, + { url = "https://files.pythonhosted.org/packages/6b/f5/ab306f292a99bff3544ff44ad53661a031dc1a11e5b1ad64b9e5b5290ef9/mysql_connector_python-9.5.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:8972c1f960b30d487f34f9125ec112ea2b3200bd02c53e5e32ee7a43be6d64c1", size = 33668933 }, + { url = "https://files.pythonhosted.org/packages/e8/ee/d146d2642552ebb5811cf551f06aca7da536c80b18fb6c75bdbc29723388/mysql_connector_python-9.5.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f6d32d7aa514d2f6f8709ba1e018314f82ab2acea2e6af30d04c1906fe9171b9", size = 34103214 }, + { url = "https://files.pythonhosted.org/packages/e7/f8/5e88e5eda1fe58f7d146b73744f691d85dce76fb42e7ce3de53e49911da3/mysql_connector_python-9.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:edd47048eb65c196b28aa9d2c0c6a017d8ca084a9a7041cd317301c829eb5a05", size = 16512689 }, + { url = "https://files.pythonhosted.org/packages/95/e1/45373c06781340c7b74fe9b88b85278ac05321889a307eaa5be079a997d4/mysql_connector_python-9.5.0-py2.py3-none-any.whl", hash = "sha256:ace137b88eb6fdafa1e5b2e03ac76ce1b8b1844b3a4af1192a02ae7c1a45bdee", size = 479047 }, +] + [[package]] name = "nbclient" version = "0.10.2" @@ -3159,6 +3192,50 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970 }, ] +[[package]] +name = "pyodbc" +version = "5.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8f/85/44b10070a769a56bd910009bb185c0c0a82daff8d567cd1a116d7d730c7d/pyodbc-5.3.0.tar.gz", hash = "sha256:2fe0e063d8fb66efd0ac6dc39236c4de1a45f17c33eaded0d553d21c199f4d05", size = 121770 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/cd/d0ac9e8963cf43f3c0e8ebd284cd9c5d0e17457be76c35abe4998b7b6df2/pyodbc-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6682cdec78f1302d0c559422c8e00991668e039ed63dece8bf99ef62173376a5", size = 71888 }, + { url = "https://files.pythonhosted.org/packages/cb/7b/95ea2795ea8a0db60414e14f117869a5ba44bd52387886c1a210da637315/pyodbc-5.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9cd3f0a9796b3e1170a9fa168c7e7ca81879142f30e20f46663b882db139b7d2", size = 71813 }, + { url = "https://files.pythonhosted.org/packages/95/c9/6f4644b60af513ea1c9cab1ff4af633e8f300e8468f4ae3507f04524e641/pyodbc-5.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:46185a1a7f409761716c71de7b95e7bbb004390c650d00b0b170193e3d6224bb", size = 318556 }, + { url = "https://files.pythonhosted.org/packages/19/3f/24876d9cb9c6ce1bd2b6f43f69ebc00b8eb47bf1ed99ee95e340bf90ed79/pyodbc-5.3.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:349a9abae62a968b98f6bbd23d2825151f8d9de50b3a8f5f3271b48958fdb672", size = 322048 }, + { url = "https://files.pythonhosted.org/packages/1f/27/faf17353605ac60f80136bc3172ed2d69d7defcb9733166293fc14ac2c52/pyodbc-5.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ac23feb7ddaa729f6b840639e92f83ff0ccaa7072801d944f1332cd5f5b05f47", size = 1286123 }, + { url = 
"https://files.pythonhosted.org/packages/d4/61/c9d407d2aa3e89f9bb68acf6917b0045a788ae8c3f4045c34759cb77af63/pyodbc-5.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8aa396c6d6af52ccd51b8c8a5bffbb46fd44e52ce07ea4272c1d28e5e5b12722", size = 1343502 }, + { url = "https://files.pythonhosted.org/packages/d9/9f/f1b0f3238d873d4930aa2a2b8d5ba97132f6416764bf0c87368f8d6f2139/pyodbc-5.3.0-cp310-cp310-win32.whl", hash = "sha256:46869b9a6555ff003ed1d8ebad6708423adf2a5c88e1a578b9f029fb1435186e", size = 62968 }, + { url = "https://files.pythonhosted.org/packages/d8/26/5f8ebdca4735aad0119aaaa6d5d73b379901b7a1dbb643aaa636040b27cf/pyodbc-5.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:705903acf6f43c44fc64e764578d9a88649eb21bf7418d78677a9d2e337f56f2", size = 69397 }, + { url = "https://files.pythonhosted.org/packages/d1/c8/480a942fd2e87dd7df6d3c1f429df075695ed8ae34d187fe95c64219fd49/pyodbc-5.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:c68d9c225a97aedafb7fff1c0e1bfe293093f77da19eaf200d0e988fa2718d16", size = 64446 }, + { url = "https://files.pythonhosted.org/packages/e0/c7/534986d97a26cb8f40ef456dfcf00d8483161eade6d53fa45fcf2d5c2b87/pyodbc-5.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ebc3be93f61ea0553db88589e683ace12bf975baa954af4834ab89f5ee7bf8ae", size = 71958 }, + { url = "https://files.pythonhosted.org/packages/69/3c/6fe3e9eae6db1c34d6616a452f9b954b0d5516c430f3dd959c9d8d725f2a/pyodbc-5.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9b987a25a384f31e373903005554230f5a6d59af78bce62954386736a902a4b3", size = 71843 }, + { url = "https://files.pythonhosted.org/packages/44/0e/81a0315d0bf7e57be24338dbed616f806131ab706d87c70f363506dc13d5/pyodbc-5.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:676031723aac7dcbbd2813bddda0e8abf171b20ec218ab8dfb21d64a193430ea", size = 327191 }, + { url = "https://files.pythonhosted.org/packages/43/ae/b95bb2068f911950322a97172c68675c85a3e87dc04a98448c339fcbef21/pyodbc-5.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5c30c5cd40b751f77bbc73edd32c4498630939bcd4e72ee7e6c9a4b982cc5ca", size = 332228 }, + { url = "https://files.pythonhosted.org/packages/dc/21/2433625f7d5922ee9a34e3805805fa0f1355d01d55206c337bb23ec869bf/pyodbc-5.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2035c7dfb71677cd5be64d3a3eb0779560279f0a8dc6e33673499498caa88937", size = 1296469 }, + { url = "https://files.pythonhosted.org/packages/3a/f4/c760caf7bb9b3ab988975d84bd3e7ebda739fe0075c82f476d04ee97324c/pyodbc-5.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5cbe4d753723c8a8f65020b7a259183ef5f14307587165ce37e8c7e251951852", size = 1353163 }, + { url = "https://files.pythonhosted.org/packages/14/ad/f9ca1e9e44fd91058f6e35b233b1bb6213d590185bfcc2a2c4f1033266e7/pyodbc-5.3.0-cp311-cp311-win32.whl", hash = "sha256:d255f6b117d05cfc046a5201fdf39535264045352ea536c35777cf66d321fbb8", size = 62925 }, + { url = "https://files.pythonhosted.org/packages/e6/cf/52b9b94efd8cfd11890ae04f31f50561710128d735e4e38a8fbb964cd2c2/pyodbc-5.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:f1ad0e93612a6201621853fc661209d82ff2a35892b7d590106fe8f97d9f1f2a", size = 69329 }, + { url = "https://files.pythonhosted.org/packages/8b/6f/bf5433bb345007f93003fa062e045890afb42e4e9fc6bd66acc2c3bd12ca/pyodbc-5.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:0df7ff47fab91ea05548095b00e5eb87ed88ddf4648c58c67b4db95ea4913e23", size = 64447 }, + { url = 
"https://files.pythonhosted.org/packages/f5/0c/7ecf8077f4b932a5d25896699ff5c394ffc2a880a9c2c284d6a3e6ea5949/pyodbc-5.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5ebf6b5d989395efe722b02b010cb9815698a4d681921bf5db1c0e1195ac1bde", size = 72994 }, + { url = "https://files.pythonhosted.org/packages/03/78/9fbde156055d88c1ef3487534281a5b1479ee7a2f958a7e90714968749ac/pyodbc-5.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:197bb6ddafe356a916b8ee1b8752009057fce58e216e887e2174b24c7ab99269", size = 72535 }, + { url = "https://files.pythonhosted.org/packages/9f/f9/8c106dcd6946e95fee0da0f1ba58cd90eb872eebe8968996a2ea1f7ac3c1/pyodbc-5.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c6ccb5315ec9e081f5cbd66f36acbc820ad172b8fa3736cf7f993cdf69bd8a96", size = 333565 }, + { url = "https://files.pythonhosted.org/packages/4b/30/2c70f47a76a4fafa308d148f786aeb35a4d67a01d41002f1065b465d9994/pyodbc-5.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dd3d5e469f89a3112cf8b0658c43108a4712fad65e576071e4dd44d2bd763c7", size = 340283 }, + { url = "https://files.pythonhosted.org/packages/7d/b2/0631d84731606bfe40d3b03a436b80cbd16b63b022c7b13444fb30761ca8/pyodbc-5.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b180bc5e49b74fd40a24ef5b0fe143d0c234ac1506febe810d7434bf47cb925b", size = 1302767 }, + { url = "https://files.pythonhosted.org/packages/74/b9/707c5314cca9401081b3757301241c167a94ba91b4bd55c8fa591bf35a4a/pyodbc-5.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e3c39de3005fff3ae79246f952720d44affc6756b4b85398da4c5ea76bf8f506", size = 1361251 }, + { url = "https://files.pythonhosted.org/packages/97/7c/893036c8b0c8d359082a56efdaa64358a38dda993124162c3faa35d1924d/pyodbc-5.3.0-cp312-cp312-win32.whl", hash = "sha256:d32c3259762bef440707098010035bbc83d1c73d81a434018ab8c688158bd3bb", size = 63413 }, + { url = "https://files.pythonhosted.org/packages/c0/70/5e61b216cc13c7f833ef87f4cdeab253a7873f8709253f5076e9bb16c1b3/pyodbc-5.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:fe77eb9dcca5fc1300c9121f81040cc9011d28cff383e2c35416e9ec06d4bc95", size = 70133 }, + { url = "https://files.pythonhosted.org/packages/aa/85/e7d0629c9714a85eb4f85d21602ce6d8a1ec0f313fde8017990cf913e3b4/pyodbc-5.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:afe7c4ac555a8d10a36234788fc6cfc22a86ce37fc5ba88a1f75b3e6696665dc", size = 64700 }, + { url = "https://files.pythonhosted.org/packages/0c/1d/9e74cbcc1d4878553eadfd59138364b38656369eb58f7e5b42fb344c0ce7/pyodbc-5.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7e9ab0b91de28a5ab838ac4db0253d7cc8ce2452efe4ad92ee6a57b922bf0c24", size = 72975 }, + { url = "https://files.pythonhosted.org/packages/37/c7/27d83f91b3144d3e275b5b387f0564b161ddbc4ce1b72bb3b3653e7f4f7a/pyodbc-5.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6132554ffbd7910524d643f13ce17f4a72f3a6824b0adef4e9a7f66efac96350", size = 72541 }, + { url = "https://files.pythonhosted.org/packages/1b/33/2bb24e7fc95e98a7b11ea5ad1f256412de35d2e9cc339be198258c1d9a76/pyodbc-5.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1629af4706e9228d79dabb4863c11cceb22a6dab90700db0ef449074f0150c0d", size = 343287 }, + { url = "https://files.pythonhosted.org/packages/fa/24/88cde8b6dc07a93a92b6c15520a947db24f55db7bd8b09e85956642b7cf3/pyodbc-5.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:5ceaed87ba2ea848c11223f66f629ef121f6ebe621f605cde9cfdee4fd9f4b68", size = 350094 }, + { url = "https://files.pythonhosted.org/packages/c2/99/53c08562bc171a618fa1699297164f8885e66cde38c3b30f454730d0c488/pyodbc-5.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3cc472c8ae2feea5b4512e23b56e2b093d64f7cbc4b970af51da488429ff7818", size = 1301029 }, + { url = "https://files.pythonhosted.org/packages/d8/10/68a0b5549876d4b53ba4c46eed2a7aca32d589624ed60beef5bd7382619e/pyodbc-5.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c79df54bbc25bce9f2d87094e7b39089c28428df5443d1902b0cc5f43fd2da6f", size = 1361420 }, + { url = "https://files.pythonhosted.org/packages/41/0f/9dfe4987283ffcb981c49a002f0339d669215eb4a3fe4ee4e14537c52852/pyodbc-5.3.0-cp313-cp313-win32.whl", hash = "sha256:c2eb0b08e24fe5c40c7ebe9240c5d3bd2f18cd5617229acee4b0a0484dc226f2", size = 63399 }, + { url = "https://files.pythonhosted.org/packages/56/03/15dcefe549d3888b649652af7cca36eda97c12b6196d92937ca6d11306e9/pyodbc-5.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:01166162149adf2b8a6dc21a212718f205cabbbdff4047dc0c415af3fd85867e", size = 70133 }, + { url = "https://files.pythonhosted.org/packages/c4/c1/c8b128ae59a14ecc8510e9b499208e342795aecc3af4c3874805c720b8db/pyodbc-5.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:363311bd40320b4a61454bebf7c38b243cd67c762ed0f8a5219de3ec90c96353", size = 64683 }, +] + [[package]] name = "pyparsing" version = "3.2.3"