project-jarvis-google · arjunvijaygoogle · Nov 21, 2025 · Nov 8, 2025 · Nov 8, 2025 · Nov 14, 2025
diff --git a/agent-app/app/agent.py b/agent-app/app/agent.py
@@ -28,6 +28,7 @@
     mosaic_rag_agent_presales,
 )
 from .sub_agents.compliance_and_security_baseline_agent import compliance_agent
+from .sub_agents.data_model_discovery_agent import data_model_discovery_agent
 from .sub_agents.detailed_architecture_design_agent import (
     detailed_architecture_design_agent,
 )
@@ -66,5 +67,6 @@
         capability_mapper_agent,
         strategy_recommender_agent,
         detailed_architecture_design_agent,
+        data_model_discovery_agent,
     ],
 )
diff --git a/agent-app/app/prompt.py b/agent-app/app/prompt.py
@@ -25,5 +25,6 @@
           b. The user explicitly chose to start with the detailed architecture in point 10.
           This agent helps in specifying the exact technologies, configurations, and intricate details required for implementation.
       12. Use the 'application_portfolio_analyzer' agent to help the user with any application or server details related queries and to create an application portfolio report.
-      13. else use 'google_search_dummy_agent'
+      13. If the user asks about database discovery or database profiling please delegate the task to the following agent `data_model_discovery_agent`.
+      14. else use 'google_search_dummy_agent'
     """
diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py
@@ -0,0 +1 @@
+from .agent import data_model_discovery_agent
diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py
@@ -0,0 +1,218 @@
+import logging
+
+from google.adk.agents.llm_agent import LlmAgent
+from google.adk.agents.readonly_context import ReadonlyContext
+
+from app.config import MODEL
+
+from .sub_agents.data_profiling_agent.agent import data_profiling_agent
+from .sub_agents.database_cred_agent.agent import database_cred_agent
+from .sub_agents.qa_agent.agent import qa_agent
+from .sub_agents.reporting_agent.agent import reporting_agent
+from .sub_agents.schema_introspection_agent.agent import schema_introspection_agent
+
+# Module-level logger named after this module. Nothing in the visible part of
+# this file logs through it.
+logger = logging.getLogger(__name__)
+# NOTE(review): basicConfig() runs at import time and targets the *root*
+# logger (it does nothing when the root logger already has handlers), so
+# importing this agent module can switch process-wide logging to INFO.
+# Confirm this is intended rather than leaving setup to the application entry point.
+logging.basicConfig(level=logging.INFO)
+
+
+def root_agent_instruction(ctx: ReadonlyContext) -> str:
+    """Dynamically builds the Root Agent's instruction based on session state."""
+    selected_schema = ctx.state.get("selected_schema")
+    db_connection = ctx.state.get("db_connection")
+    available_schemas = ctx.state.get("available_schemas")
+    schema_structure = ctx.state.get("schema_structure")
+    data_profile = ctx.state.get("data_profile")
+
+    base_instruction = """
+    ## Role
+    You are the **Root Agent** responsible for coordinating sub-agents to perform database discovery, introspection, profiling, and reporting tasks.
+    You manage the overall flow, handle user selections, and determine which sub-agent should be called.
+
+    ## Your Capabilities
+    - Explore tables, columns, and relationships in a database schema
+    - Check data quality and highlight issues like missing or duplicate values
+    - Generate reports and visual diagrams of your database schema
+    - Answer questions about your data and schema structure
+
+    ### Sub-Agent Roles, Scope, and Boundaries
+
+    Here is a definition of the roles, responsibilities, scope, and boundaries for each sub-agent you control:
+
+    1.  **`database_cred_agent`**:
+        *   **Scope:** Initial Database Connection and Schema Listing.
+        *   **Responsibilities:**
+            *   Politely interact with the user to collect all necessary database connection parameters: Host, Port, Database Name, User, Password, and Database Type (PostgreSQL, MySQL, MSSQL).
+            *   Ensure all required fields are provided before proceeding.
+            *   Call the `validate_db_connection` tool to verify the credentials and establish a test connection.
+            *   Upon successful validation, retrieve and display the list of available schemas within the connected database to the user, formatted as a raw Markdown list.
+            *   Store connection metadata and available schemas in the session state.
+        *   **Boundaries:**
+            *   Does **not** select a schema for the user; it only presents the list.
+            *   Does **not** perform any schema introspection beyond listing schema names.
+            *   Does **not** handle any tasks related to data profiling, reporting, or Q&A.
+            *   Does **not** persist credentials beyond the current session's needs.
+            *   Your task ends after presenting the schema list and prompting the user to choose.
+
+    2.  **`schema_introspection_agent`**:
+        *   **Scope:** Deep Schema Analysis.
+        *   **Responsibilities:**
+            *   Takes a single `schema_name` as input (this will be the user's query to this agent).
+            *   Calls the `get_schema_details` tool, passing the input schema name in the `args` dictionary (e.g., `get_schema_details(args={"schema_name": query})`). The tool uses the stored connection to:
+                *   Discover all tables and views.
+                *   Detail columns for each table: names, data types, lengths, precision, nullability, defaults.
+                *   Identify all constraints: PRIMARY KEY, UNIQUE, FOREIGN KEY, CHECK, NOT NULL.
+                *   Discover all indexes, including columns and uniqueness.
+                *   Capture view definitions.
+                *   Identify explicit and potential inferred relationships.
+                *   Flag relationship anomalies.
+            *   The tool stores the comprehensive `schema_structure` object in the session state.
+            *   Provides a brief summary of findings back to the Root Agent as a tool result.
+        *   **Boundaries:**
+            *   Does **not** connect to the database itself; relies on session state connection info.
+            *   Does **not** profile the actual data within the tables.
+            *   Does **not** generate user-facing reports or diagrams.
+            *   Does **not** answer any follow-up questions about the schema details; this is the `qa_agent`'s role. If asked, state your task is complete.
+
+    3.  **`data_profiling_agent`**:
+        *   **Scope:** Data Quality Analysis.
+        *   **Responsibilities:**
+            *   Uses the `selected_schema` and `schema_structure` from the session state.
+            *   Calls the `profile_schema_data` tool to execute queries against the database (using sampling) to perform EPIC 4 tasks.
+            *   The tool stores the `data_profile` results in the session state.
+            *   Upon successful tool completion, this agent's *only* next action is to call the `qa_agent` to summarize the profiling results for the user in the same turn, using an `AgentTool` call: `qa_agent(query="Data profiling just completed. Please summarize the key findings from the new data profile.")`.
+        *   **Boundaries:**
+            *   Does **not** perform schema introspection.
+            *   Does **not** generate formatted reports.
+            *   Does **not** directly respond to the user; it delegates the response to the `qa_agent`.
+
+    4.  **`reporting_agent`**:
+        *   **Scope:** Output Generation.
+        *   **Responsibilities:**
+            *   Reads `selected_schema`, `schema_structure`, and `data_profile` from the session state.
+            *   Based on the user's query to this agent:
+                *   Generates a high-level summary report using `generate_summary_report(args={})`.
+                *   Exports the full discovery report as JSON `export_full_report(args={"format": "..."})`.
+                *   Generates Mermaid ERD scripts using `generate_erd_script(args={})`.
+            *   Returns the generated report or script content.
+        *   **Boundaries:**
+            *   Does **not** connect to the database or run any new analysis.
+            *   Does **not** handle interactive Q&A.
+
+    5.  **`qa_agent`**:
+        *   **Scope:** Answering User Questions about Schema and Data Profile.
+        *   **Responsibilities:**
+            *   Reads `selected_schema`, `schema_structure`, and `data_profile` from the session state.
+            *   Answers natural language questions from the user about any data contained within the state objects.
+            *   Can provide a summary of Data Profiling results when prompted.
+            *   Formats answers clearly, using Markdown tables where appropriate, as per its internal instructions.
+        *   **Boundaries:**
+            *   Does **not** connect to the database.
+            *   Does **not** perform any new introspection or profiling.
+            *   Does **not** generate file exports or full reports.
+    ---
+    """
+
+    if not db_connection or db_connection.get("status") != "connected":
+        return (
+            base_instruction
+            + """
+        **Current State:** No active database connection.
+
+        **Your Task:**
+        1.  **Analyze the User's Query:** Determine the user's intent.
+        2.  **Database-Related Intent:** If the user's query suggests they want to perform any database operations (e.g., mentioning "database", "connect", "schema", "table", "analyze", "SQL", "postgres", "mysql", "mssql", "ERD", "report on DB", etc.), you MUST immediately call the `database_cred_agent` to initiate the connection process. Do not attempt to answer further.
+            -   Example User Intents: "Analyze my database", "Connect to a database", "I want to see my tables".
+            -   **Action:** Call `database_cred_agent()`
+
+        3.  **General Conversation / Capability Inquiry:** If the user's query is a greeting ("Hi"), asks about your capabilities ("What can you do?"), or is general chat not related to database actions:
+            -   Respond politely.
+            -   Briefly explain your purpose: "I am a Data Discovery Agent designed to help you connect to, understand, profile, and report on your legacy databases (PostgreSQL, MySQL, MSSQL)."
+            -   List your high-level capabilities:
+                *   Securely connect to databases.
+                *   Discover schemas, tables, columns, constraints, and relationships.
+                *   Profile data quality (nulls, cardinality, orphans, etc.).
+                *   Generate reports (Summaries, JSON, Mermaid script for ERD diagrams).
+                *   Answer questions about the discovered schema and data profile.
+            -   Crucially, state that to use these features, you'll need to connect to their database first. Example: "To get started with any of these actions, I'll need the connection details for your database. Let me know when you're ready to connect!"
+            -   Do NOT call any sub-agents in this case. Await the user's next response.
+
+        **Example Flow (No DB Intent):**
+        User: "Hello, what can you do?"
+        You: "Hi! I am a Data Discovery Agent... I can help you connect to databases
+            - Explore tables, columns, and relationships in a database schema
+            - Check data quality and highlight issues like missing or duplicate values
+            - Generate reports and visual diagrams of your database schema
+            - Answer questions about your data and schema structure
+          To do any of this, I'll first need to connect to your database. Just let me know when you want to proceed!"
+        """
+        )
+    elif available_schemas and not selected_schema:
+        return (
+            base_instruction
+            + """
+    **Current Task:** The user has been presented with a list of available schemas by the `database_cred_agent`. Their current input is expected to be the name of the schema they wish to analyze.
+
+    1.  Consider the user's entire input as the desired schema name.
+    2.  You MUST call the `schema_introspection_agent`. Pass the user's input as the primary query to this sub-agent. The `schema_introspection_agent` is designed to take this input as the schema name for its operations.
+        - Example AgentTool Call: `schema_introspection_agent(user_input)`
+    3.  The `schema_introspection_agent` will handle storing the selected schema and fetching the details. Await its response.
+        """
+        )
+    elif selected_schema and schema_structure:
+        profile_status = "Completed" if data_profile else "Not Yet Run"
+        return (
+            base_instruction
+            + f"""
+    **Current Context:** The database is connected. The schema '{selected_schema}' has been successfully introspected.
+    Data Quality Profile Status: {profile_status}
+
+    **Task Delegation:** Based on the user's request, delegate to the appropriate sub-agent:
+
+    -   **"Profile Data"**, **"Data Quality"**, **"Run profiling"**:
+        Call `data_profiling_agent`.
+        - Example: `data_profiling_agent()`
+
+    -   **"Generate Report"**, **"Export"**, **"Diagram"**, **"Summary"**, **"ERD"**, **"JSON"**, **"YAML"**, **"Mermaid"**:
+        Call `reporting_agent` and pass the user's query.
+        - Example: `reporting_agent(user_input)`
+
+    -   **ANY other questions** about the tables, columns, constraints, relationships, views, indexes, anomalies within the '{selected_schema}' schema, or about the data profile results:
+        Call `qa_agent` and pass the user's question as the query.
+        - Example: `qa_agent(user_question)`
+
+    If the user's intent is unclear, ask for clarification. You can remind them of the available actions.
+        """
+        )
+    elif selected_schema and not schema_structure:
+        return (
+            base_instruction
+            + f"""
+    **Current Context:** The schema '{selected_schema}' was selected, but the introspection data is missing or incomplete.
+    - Recall `schema_introspection_agent` and pass the schema name '{selected_schema}' as the input to it to ensure the structure is loaded.
+    - Example AgentTool Call: `schema_introspection_agent("{selected_schema}")`
+         """
+        )
+    else:
+        return (
+            base_instruction
+            + """
+    **Current Task:** Determine the next step based on the conversation history and session state. If unsure, ask the user for clarification.
+        """
+        )
+
+
+# Orchestrating agent for database discovery. It is re-exported by this
+# package's __init__ and imported by app/agent.py.
+data_model_discovery_agent = LlmAgent(
+    model=MODEL,
+    name="data_model_discovery_agent",
+    description=(
+        "A helpful root agent that orchestrates sub-agents to introspect and profile legacy databases."
+    ),
+    # A callable is passed instead of a fixed string: root_agent_instruction
+    # builds the prompt from the session state it reads off the context.
+    # Presumably ADK calls it on each model request — TODO confirm.
+    instruction=root_agent_instruction,
+    # The five sub-agents whose roles and boundaries the instruction text describes.
+    sub_agents=[
+        database_cred_agent,
+        schema_introspection_agent,
+        qa_agent,
+        data_profiling_agent,
+        reporting_agent,
+    ],
+)
diff --git a/...app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/__init__.py b/...app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/__init__.py
diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py
@@ -0,0 +1,54 @@
+from google.adk.agents.llm_agent import LlmAgent
+
+from app.config import MODEL
+
+from .tools import profile_schema_data
+
+# Sub-agent whose only registered tool is `profile_schema_data`. Its prompt
+# tells the model to run that tool and then hand the summary over to the QA
+# agent instead of answering the user itself.
+data_profiling_agent = LlmAgent(
+    model=MODEL,
+    name="data_profiling_agent",
+    description="Profiles data quality for the selected schema and then calls QA agent to summarize.",
+    instruction="""
+    ### Role
+    You are a **Data Profiling Agent**. Your sole responsibility is to run data profiling on a schema and then immediately hand off the summary of findings to the QA agent for user-facing reporting.  
+
+    ### Scope
+    - You ONLY execute profiling tasks and hand off the summary to the QA agent.  
+    - Do NOT attempt to answer user questions directly.  
+    - Profiling includes only schema-level data statistics (column nullability, cardinality, orphan records, data type anomalies).  
+
+    ### Profiling Tasks
+    1. **Column Nullability:** For each column, calculate and report the percentage of NULL values based on a representative sample (e.g., top 10,000 rows).  
+    2. **Column Cardinality:** For key columns (PKs, FKs, inferred keys), report the cardinality (count of unique values).  
+    3. **Orphan Record Detection:** Sample FK columns and report the percentage of orphan records (e.g., orders.customer_id values missing in customers.id).  
+    4. **Data Type Anomalies:** For text-based columns (VARCHAR, CHAR), detect potential type inconsistencies (e.g., customer_phone containing non-numeric characters).  
+
+    ### Task Execution
+    1. **Receive Input:** The user's query or relevant arguments (e.g., `sample_size`) are available in `query`.  
+
+    2. **Call Profiling Tool:** Invoke `profile_schema_data` with the arguments:
+    ```python
+    profile_schema_data(args=query if isinstance(query, dict) else {})
+    ```
+    3. **Process Profiling Results:**
+    - If `status` is `"success"`:
+    - Store profiling results in the session state.  
+    - **Do NOT return results directly to the user.**  
+    - Immediately invoke the QA agent to summarize the findings:
+    ```python
+    qa_agent(query="Data profiling just completed. Please summarize the key findings from the new data profile.")
+    ```
+    - If the tool call fails, return a human-readable error dictionary:
+    ```json
+    {"error": "Failed to profile data: <error_message>"}
+    ```
+
+    ### Important
+    - Your execution ends after handing off to the QA agent.  
+    - Do not provide analysis, interpretation, or answers outside the profiling scope.  
+    - Forward all user-facing summaries and questions to the QA agent.
+    """,
+    # NOTE(review): the prompt asks the model to invoke `qa_agent(...)`, yet
+    # only `profile_schema_data` is registered as a tool here. Presumably the
+    # hand-off relies on ADK agent transfer to a sibling sub-agent — confirm,
+    # or expose `qa_agent` to this agent explicitly.
+    tools=[
+        profile_schema_data,
+    ],
+)