From 6b2e23c7ecef6733a5c4c815ff1009156f7cafc2 Mon Sep 17 00:00:00 2001 From: SanuGhosh Date: Sat, 8 Nov 2025 14:30:47 +0530 Subject: [PATCH 1/8] feat(data-model-discovery-agent): implementation of data_model_discovery_agent with its tools and sub_agents; code refactoring --- agent-app/app/agent.py | 3 +++ agent-app/app/prompt.py | 5 ++-- agent-app/pyproject.toml | 3 ++- agent-app/uv.lock | 58 +++++++++++++++++++++++----------------- 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/agent-app/app/agent.py b/agent-app/app/agent.py index 3592f28..63c581f 100644 --- a/agent-app/app/agent.py +++ b/agent-app/app/agent.py @@ -38,6 +38,8 @@ from .sub_agents.recommendation_agent import recommendation_agent from .sub_agents.strategy_recommender_agent import strategy_recommender_agent from .sub_agents.tech_stack_profiler_agent import tech_stack_profiler +from .sub_agents.data_model_discovery_agent import data_model_discovery_agent + from .tools import ( transfer_to_capability_mapper_agent_tool, transfer_to_discovery_agent_tool, @@ -66,5 +68,6 @@ capability_mapper_agent, strategy_recommender_agent, detailed_architecture_design_agent, + data_model_discovery_agent ], ) diff --git a/agent-app/app/prompt.py b/agent-app/app/prompt.py index 1ed4fdd..79d443a 100644 --- a/agent-app/app/prompt.py +++ b/agent-app/app/prompt.py @@ -25,5 +25,6 @@ b. The user explicitly chose to start with the detailed architecture in point 10. This agent helps in specifying the exact technologies, configurations, and intricate details required for implementation. 12. Use the 'application_portfolio_analyzer' agent to help the user with any application or server details related queries and to create an application portfolio report. - 13. else use 'google_search_dummy_agent' - """ + 13. If the user asks about database discovery or database profiling, delegate the task to the `data_model_discovery_agent`. + 14. 
else use 'google_search_dummy_agent' + """ \ No newline at end of file diff --git a/agent-app/pyproject.toml b/agent-app/pyproject.toml index 457526d..c13db66 100644 --- a/agent-app/pyproject.toml +++ b/agent-app/pyproject.toml @@ -14,6 +14,8 @@ dependencies = [ "uvicorn~=0.34.0", "psycopg2-binary>=2.9.10", "google-genai~=1.41.0", + "mysql-connector-python", + "pyodbc", "GitPython>=3.1.45", "google-cloud-storage", "reportlab", @@ -28,7 +30,6 @@ dependencies = [ "scipy~=1.15.0", "pygithub~=2.8.1", "googlesearch-python" - ] requires-python = ">=3.10,<3.14" diff --git a/agent-app/uv.lock b/agent-app/uv.lock index ece3f6f..05d78ce 100644 --- a/agent-app/uv.lock +++ b/agent-app/uv.lock @@ -33,11 +33,13 @@ dependencies = [ { name = "googlesearch-python" }, { name = "markdown-it-py" }, { name = "markdown-pdf" }, + { name = "mysql-connector-python" }, { name = "opentelemetry-exporter-gcp-trace" }, { name = "pandas" }, { name = "pdfplumber" }, { name = "plantuml" }, { name = "psycopg2-binary" }, + { name = "pyodbc" }, { name = "pygithub" }, { name = "reportlab" }, { name = "scipy" }, @@ -82,6 +84,7 @@ requires-dist = [ { name = "markdown-it-py", specifier = "==3.0.0" }, { name = "markdown-pdf", specifier = "==1.9" }, { name = "mypy", marker = "extra == 'lint'", specifier = "~=1.15.0" }, + { name = "mysql-connector-python" }, { name = "opentelemetry-exporter-gcp-trace", specifier = "~=1.9.0" }, { name = "pandas", specifier = "==2.3.2" }, { name = "pdfplumber" }, @@ -89,6 +92,7 @@ requires-dist = [ { name = "psycopg2-binary", specifier = ">=2.9.10" }, { name = "pygithub", specifier = "~=2.8.1" }, { name = "pylint", marker = "extra == 'lint'", specifier = ">=4.0.1" }, + { name = "pyodbc" }, { name = "reportlab" }, { name = "ruff", marker = "extra == 'lint'", specifier = ">=0.4.6" }, { name = "scipy", specifier = "~=1.15.0" }, @@ -2138,6 +2142,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963 }, ] +[[package]] +name = "mysql-connector-python" +version = "9.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/39/33/b332b001bc8c5ee09255a0d4b09a254da674450edd6a3e5228b245ca82a0/mysql_connector_python-9.5.0.tar.gz", hash = "sha256:92fb924285a86d8c146ebd63d94f9eaefa548da7813bc46271508fdc6cc1d596", size = 12251077 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/5d/30210fcf7ba98d1e03de0c47a58218ab5313d82f2e01ae53b47f45c36b9d/mysql_connector_python-9.5.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:77d14c9fde90726de22443e8c5ba0912a4ebb632cc1ade52a349dacbac47b140", size = 17579085 }, + { url = "https://files.pythonhosted.org/packages/77/92/ea79a0875436665330a81e82b4b73a6d52aebcfb1cf4d97f4ad4bd4dedf5/mysql_connector_python-9.5.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:4d603b55de310b9689bb3cb5e57fe97e98756e36d62f8f308f132f2c724f62b8", size = 18445098 }, + { url = "https://files.pythonhosted.org/packages/5f/f2/4578b5093f46985c659035e880e70e8b0bed44d4a59ad4e83df5d49b9c69/mysql_connector_python-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:48ffa71ba748afaae5c45ed9a085a72604368ce611fe81c3fdc146ef60181d51", size = 33660118 }, + { url = 
"https://files.pythonhosted.org/packages/c5/60/63135610ae0cee1260ce64874c1ddbf08e7fb560c21a3d9cce88b0ddc266/mysql_connector_python-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:77c71df48293d3c08713ff7087cf483804c8abf41a4bb4aefea7317b752c8e9a", size = 34096212 }, + { url = "https://files.pythonhosted.org/packages/3e/b1/78dc693552cfbb45076b3638ca4c402fae52209af8f276370d02d78367a0/mysql_connector_python-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:4f8d2d9d586c34dc9508a44d19cf30ccafabbbd12d7f8ab58da3af118636843c", size = 16512395 }, + { url = "https://files.pythonhosted.org/packages/05/03/77347d58b0027ce93a41858477e08422e498c6ebc24348b1f725ed7a67ae/mysql_connector_python-9.5.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:653e70cd10cf2d18dd828fae58dff5f0f7a5cf7e48e244f2093314dddf84a4b9", size = 17578984 }, + { url = "https://files.pythonhosted.org/packages/a5/bb/0f45c7ee55ebc56d6731a593d85c0e7f25f83af90a094efebfd5be9fe010/mysql_connector_python-9.5.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:5add93f60b3922be71ea31b89bc8a452b876adbb49262561bd559860dae96b3f", size = 18445067 }, + { url = "https://files.pythonhosted.org/packages/1c/ec/054de99d4aa50d851a37edca9039280f7194cc1bfd30aab38f5bd6977ebe/mysql_connector_python-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:20950a5e44896c03e3dc93ceb3a5e9b48c9acae18665ca6e13249b3fe5b96811", size = 33668029 }, + { url = "https://files.pythonhosted.org/packages/90/a2/e6095dc3a7ad5c959fe4a65681db63af131f572e57cdffcc7816bc84e3ad/mysql_connector_python-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:7fdd3205b9242c284019310fa84437f3357b13f598e3f9b5d80d337d4a6406b8", size = 34101687 }, + { url = "https://files.pythonhosted.org/packages/9c/88/bc13c33fca11acaf808bd1809d8602d78f5bb84f7b1e7b1a288c383a14fd/mysql_connector_python-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:c021d8b0830958b28712c70c53b206b4cf4766948dae201ea7ca588a186605e0", size = 16511749 }, + { url = "https://files.pythonhosted.org/packages/02/89/167ebee82f4b01ba7339c241c3cc2518886a2be9f871770a1efa81b940a0/mysql_connector_python-9.5.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:a72c2ef9d50b84f3c567c31b3bf30901af740686baa2a4abead5f202e0b7ea61", size = 17581904 }, + { url = "https://files.pythonhosted.org/packages/67/46/630ca969ce10b30fdc605d65dab4a6157556d8cc3b77c724f56c2d83cb79/mysql_connector_python-9.5.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:bd9ba5a946cfd3b3b2688a75135357e862834b0321ed936fd968049be290872b", size = 18448195 }, + { url = "https://files.pythonhosted.org/packages/f6/87/4c421f41ad169d8c9065ad5c46673c7af889a523e4899c1ac1d6bfd37262/mysql_connector_python-9.5.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5ef7accbdf8b5f6ec60d2a1550654b7e27e63bf6f7b04020d5fb4191fb02bc4d", size = 33668638 }, + { url = "https://files.pythonhosted.org/packages/a6/01/67cf210d50bfefbb9224b9a5c465857c1767388dade1004c903c8e22a991/mysql_connector_python-9.5.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a6e0a4a0274d15e3d4c892ab93f58f46431222117dba20608178dfb2cc4d5fd8", size = 34102899 }, + { url = "https://files.pythonhosted.org/packages/cd/ef/3d1a67d503fff38cc30e11d111cf28f0976987fb175f47b10d44494e1080/mysql_connector_python-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:b6c69cb37600b7e22f476150034e2afbd53342a175e20aea887f8158fc5e3ff6", size = 16512684 }, + { url = 
"https://files.pythonhosted.org/packages/72/18/f221aeac49ce94ac119a427afbd51fe1629d48745b571afc0de49647b528/mysql_connector_python-9.5.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:1f5f7346b0d5edb2e994c1bd77b3f5eed88b0ca368ad6788d1012c7e56d7bf68", size = 17581933 }, + { url = "https://files.pythonhosted.org/packages/de/8e/14d44db7353350006a12e46d61c3a995bba06acd7547fc78f9bb32611e0c/mysql_connector_python-9.5.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:07bf52591b4215cb4318b4617c327a6d84c31978c11e3255f01a627bcda2618e", size = 18448446 }, + { url = "https://files.pythonhosted.org/packages/6b/f5/ab306f292a99bff3544ff44ad53661a031dc1a11e5b1ad64b9e5b5290ef9/mysql_connector_python-9.5.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:8972c1f960b30d487f34f9125ec112ea2b3200bd02c53e5e32ee7a43be6d64c1", size = 33668933 }, + { url = "https://files.pythonhosted.org/packages/e8/ee/d146d2642552ebb5811cf551f06aca7da536c80b18fb6c75bdbc29723388/mysql_connector_python-9.5.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f6d32d7aa514d2f6f8709ba1e018314f82ab2acea2e6af30d04c1906fe9171b9", size = 34103214 }, + { url = "https://files.pythonhosted.org/packages/e7/f8/5e88e5eda1fe58f7d146b73744f691d85dce76fb42e7ce3de53e49911da3/mysql_connector_python-9.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:edd47048eb65c196b28aa9d2c0c6a017d8ca084a9a7041cd317301c829eb5a05", size = 16512689 }, + { url = "https://files.pythonhosted.org/packages/95/e1/45373c06781340c7b74fe9b88b85278ac05321889a307eaa5be079a997d4/mysql_connector_python-9.5.0-py2.py3-none-any.whl", hash = "sha256:ace137b88eb6fdafa1e5b2e03ac76ce1b8b1844b3a4af1192a02ae7c1a45bdee", size = 479047 }, +] + [[package]] name = "nbclient" version = "0.10.2" @@ -3134,31 +3167,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/c4/87d27b108c2f6d773aa5183c5ae367b2a99296ea4bc16eb79f453c679e30/pymupdf-1.26.4-cp39-abi3-win_amd64.whl", hash = "sha256:0b6345a93a9afd28de2567e433055e873205c52e6b920b129ca50e836a3aeec6", size = 18743491 }, ] -[[package]] -name = "pynacl" -version = "1.6.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591 }, - { url = "https://files.pythonhosted.org/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866 }, - { url = "https://files.pythonhosted.org/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001 }, - { url = "https://files.pythonhosted.org/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024 }, - { url = "https://files.pythonhosted.org/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766 }, - { url = "https://files.pythonhosted.org/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275 }, - { url = "https://files.pythonhosted.org/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891 }, - { url = "https://files.pythonhosted.org/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291 }, - { url = "https://files.pythonhosted.org/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839 }, - { url = "https://files.pythonhosted.org/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371 }, - { url = "https://files.pythonhosted.org/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031 }, - { url = "https://files.pythonhosted.org/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585 }, - { url = "https://files.pythonhosted.org/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923 }, - { url = "https://files.pythonhosted.org/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970 }, -] - [[package]] name = "pyparsing" version = "3.2.3" From eb8c2cd2ad135ef0be3f2ac510e144e430298c7c Mon Sep 17 00:00:00 2001 From: SanuGhosh Date: Sat, 8 Nov 2025 14:56:05 +0530 Subject: [PATCH 2/8] feat(data-model-discovery-agent): implementation for data_model_discovery_agent along with its tools and sub_agents --- .../data_model_discovery_agent/__init__.py | 1 + .../agent.excalidraw | 1020 +++++++++++++++++ .../data_model_discovery_agent/agent.py | 102 ++ .../data_profiling_agent/__init__.py | 0 .../sub_agents/data_profiling_agent/agent.py | 31 + .../sub_agents/data_profiling_agent/tools.py | 77 ++ .../data_profiling_agent/utils/__init__.py | 0 .../utils/mssql_profiling_utils.py | 108 ++ .../utils/mysql_profiling_utils.py | 105 ++ .../utils/postgres_profiling_utils.py | 106 ++ 
.../database_cred_agent/__init__.py | 1 + .../sub_agents/database_cred_agent/agent.py | 52 + .../sub_agents/database_cred_agent/tools.py | 147 +++ .../database_introspection_agent/__init__.py | 1 + .../database_introspection_agent/agent.py | 43 + .../sub_agents/postgres_sql_agent/__init__.py | 1 + .../sub_agents/postgres_sql_agent/agent.py | 20 + .../database_introspection_agent/tools.py | 66 ++ .../sub_agents/qa_agent/__init__.py | 1 + .../sub_agents/qa_agent/agent.py | 66 ++ .../sub_agents/reporting_agent/__init__.py | 0 .../sub_agents/reporting_agent/agent.py | 56 + .../sub_agents/reporting_agent/tools.py | 136 +++ .../schema_introspection_agent/__init__.py | 1 + .../schema_introspection_agent/agent.py | 44 + .../schema_introspection_agent/tools.py | 113 ++ .../utils/__init__.py | 0 .../utils/mssql_utils.py | 154 +++ .../utils/mysql_utils.py | 112 ++ .../utils/postgresql_utils.py | 140 +++ 30 files changed, 2704 insertions(+) create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/agent.excalidraw create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/agent.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/agent.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/agent.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/tools.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py create mode 100644 
agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/__init__.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py create mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py new file mode 100644 index 0000000..7dbd57d --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py @@ -0,0 +1 @@ +from .agent import data_model_discovery_agent \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.excalidraw b/agent-app/app/sub_agents/data_model_discovery_agent/agent.excalidraw new file mode 100644 index 0000000..03436f1 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/agent.excalidraw @@ -0,0 +1,1020 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "O-PaQXH396tStCF7JKxfI", + "type": "rectangle", + "x": 310.15625, + "y": 221.078125, + "width": 110.26171875, + "height": 88.63671875, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#a5d8ff", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0", + "roundness": { + "type": 3 + }, + "seed": 1469789404, + "version": 74, + "versionNonce": 876272100, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "-GNllmBjF8GoBkCfEE2cx" + }, + { + "id": "PiKGRVvnyp7pjLD1tCi5u", + "type": "arrow" + }, + { + "id": "JU5XimKzjf75Jwi0acyvd", + "type": "arrow" + } + ], + "updated": 1761936287452, + "link": null, + "locked": false + }, + { + "id": "-GNllmBjF8GoBkCfEE2cx", + "type": "text", + "x": 320.28714752197266, + "y": 240.396484375, + "width": 89.99992370605469, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0V", + "roundness": null, + "seed": 351098332, + "version": 17, + "versionNonce": 1692608092, + "isDeleted": false, + "boundElements": [], + "updated": 1761936287452, + "link": null, + "locked": false, + "text": "db\ndiscovery", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "O-PaQXH396tStCF7JKxfI", + "originalText": "db discovery", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "IrO9UaKnLCpnbzqRSCJzy", + "type": "rectangle", + "x": 563.1640625, + "y": 126.328125, + "width": 120.13671875, + "height": 79.953125, + "angle": 0, + 
"strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1", + "roundness": { + "type": 3 + }, + "seed": 110764636, + "version": 102, + "versionNonce": 1325511004, + "isDeleted": false, + "boundElements": [ + { + "id": "PiKGRVvnyp7pjLD1tCi5u", + "type": "arrow" + }, + { + "type": "text", + "id": "DYEQeZoLdTVJOmIEEHann" + } + ], + "updated": 1761936320962, + "link": null, + "locked": false + }, + { + "id": "DYEQeZoLdTVJOmIEEHann", + "type": "text", + "x": 571.3824615478516, + "y": 153.8046875, + "width": 103.69992065429688, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1V", + "roundness": null, + "seed": 404114148, + "version": 12, + "versionNonce": 1590007012, + "isDeleted": false, + "boundElements": [], + "updated": 1761936163253, + "link": null, + "locked": false, + "text": "cred agent", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "IrO9UaKnLCpnbzqRSCJzy", + "originalText": "cred agent", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "pwSkpY2oOFIyCCIItNnKl", + "type": "rectangle", + "x": 671.876953125, + "y": 167.31640625, + "width": 120.13671875, + "height": 79.953125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffec99", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 356916452, + "version": 324, + "versionNonce": 901894620, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "nER7F15wfOHOH0UQQSG_n" + } + ], + "updated": 1761936325675, + "link": null, + "locked": false + }, + { + "id": "nER7F15wfOHOH0UQQSG_n", + "type": "text", + "x": 685.5053482055664, + "y": 182.29296875, + "width": 92.87992858886719, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2V", + "roundness": null, + "seed": 1125348444, + "version": 160, + "versionNonce": 2044933724, + "isDeleted": false, + "boundElements": [], + "updated": 1761936325675, + "link": null, + "locked": false, + "text": "validation\nagent", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "pwSkpY2oOFIyCCIItNnKl", + "originalText": "validation\nagent", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "n4vdN0n3RQnDSPjq-nr99", + "type": "rectangle", + "x": 570.306640625, + "y": 349.5625, + "width": 120.13671875, + "height": 79.953125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": { + "type": 3 + }, + "seed": 1738953820, + "version": 430, + "versionNonce": 22667364, + "isDeleted": false, + "boundElements": [ + { + "id": "JU5XimKzjf75Jwi0acyvd", + "type": "arrow" + }, + { + "id": "siBHscB5llCUfh4O0-BYR", + "type": "arrow" + }, + { + "id": "lQg3kQJ64cAzOQlsm_nnV", 
+ "type": "arrow" + }, + { + "type": "text", + "id": "XUCvYXSXY4SnFXxP9sw4E" + } + ], + "updated": 1761936254839, + "link": null, + "locked": false + }, + { + "id": "XUCvYXSXY4SnFXxP9sw4E", + "type": "text", + "x": 593.5650329589844, + "y": 364.5390625, + "width": 73.61993408203125, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 62680932, + "version": 30, + "versionNonce": 1750968420, + "isDeleted": false, + "boundElements": [], + "updated": 1761936192476, + "link": null, + "locked": false, + "text": "db spec\nagent", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "n4vdN0n3RQnDSPjq-nr99", + "originalText": "db spec\nagent", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "4KwTb3Ksy9iKmaG6rp9rf", + "type": "rectangle", + "x": 774.623046875, + "y": 285.046875, + "width": 120.13671875, + "height": 79.953125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 705126116, + "version": 180, + "versionNonce": 1194355804, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "L7IdWjNFhHkUSiIxsHLDr" + }, + { + "id": "siBHscB5llCUfh4O0-BYR", + "type": "arrow" + }, + { + "id": "il6TSwE83Z9g89nVNsKKQ", + "type": "arrow" + } + ], + "updated": 1761936257198, + "link": null, + "locked": false + }, + { + "id": "L7IdWjNFhHkUSiIxsHLDr", + "type": "text", + "x": 792.091438293457, + "y": 312.5234375, + "width": 85.19993591308594, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4V", + "roundness": null, + "seed": 1748872412, + "version": 20, + "versionNonce": 1293173988, + "isDeleted": false, + "boundElements": [], + "updated": 1761935917615, + "link": null, + "locked": false, + "text": "postgres", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "4KwTb3Ksy9iKmaG6rp9rf", + "originalText": "postgres", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "bkFb3Nq2lFu--GSYS7px9", + "type": "rectangle", + "x": 781.837890625, + "y": 409.16796875, + "width": 120.13671875, + "height": 79.953125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#b2f2bb", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a5", + "roundness": { + "type": 3 + }, + "seed": 595614044, + "version": 147, + "versionNonce": 1679980380, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "fu0fiBBjZBALJDnQu7-Lv" + }, + { + "id": "lQg3kQJ64cAzOQlsm_nnV", + "type": "arrow" + }, + { + "id": "qgF1zyHojO8Oq_8ebpZ4V", + "type": "arrow" + } + ], + "updated": 1761936259248, + "link": null, + "locked": false + }, + { + "id": "fu0fiBBjZBALJDnQu7-Lv", + "type": "text", + "x": 816.9062652587891, + "y": 436.64453125, + "width": 49.999969482421875, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + 
"backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a6", + "roundness": null, + "seed": 266223460, + "version": 8, + "versionNonce": 352620132, + "isDeleted": false, + "boundElements": [], + "updated": 1761936258198, + "link": null, + "locked": false, + "text": "mysql", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "bkFb3Nq2lFu--GSYS7px9", + "originalText": "mysql", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "aku4cEBO3VLvxLajPKOX6", + "type": "rectangle", + "x": 1091.208984375, + "y": 306.5546875, + "width": 120.13671875, + "height": 79.953125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a7", + "roundness": { + "type": 3 + }, + "seed": 1449833948, + "version": 648, + "versionNonce": 444918372, + "isDeleted": false, + "boundElements": [], + "updated": 1761936265152, + "link": null, + "locked": false + }, + { + "id": "BWyzPhPtf4XfBjxGoK1SC", + "type": "rectangle", + "x": 1098.244140625, + "y": 407.28125, + "width": 120.13671875, + "height": 79.953125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 913800420, + "version": 495, + "versionNonce": 1727028060, + "isDeleted": false, + "boundElements": [], + "updated": 1761936269823, + "link": null, + "locked": false + }, + { + "id": "ye_64zqr7CDhyAiWzvOJm", + "type": "rectangle", + "x": 1050.873046875, + "y": 232.08984375, + "width": 201.48046875000006, + "height": 273.68359375, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a9", + "roundness": { + "type": 3 + }, + "seed": 356938076, + "version": 696, + "versionNonce": 1666667868, + "isDeleted": false, + "boundElements": [ + { + "id": "il6TSwE83Z9g89nVNsKKQ", + "type": "arrow" + }, + { + "id": "qgF1zyHojO8Oq_8ebpZ4V", + "type": "arrow" + } + ], + "updated": 1761936221713, + "link": null, + "locked": false + }, + { + "id": "ZAleF85iz7usCWNXn2laL", + "type": "text", + "x": 1123.671875, + "y": 255.84375, + "width": 42.25996398925781, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aA", + "roundness": null, + "seed": 1498817892, + "version": 132, + "versionNonce": 551298012, + "isDeleted": false, + "boundElements": [], + "updated": 1761936079075, + "link": null, + "locked": false, + "text": "utils", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "utils", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "5S9Z3gdR-hzsrzK_vw80A", + "type": "text", + "x": 1108.55859375, + "y": 321.95703125, + "width": 95.15994262695312, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + 
"fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": null, + "seed": 336976356, + "version": 117, + "versionNonce": 2027863268, + "isDeleted": false, + "boundElements": [], + "updated": 1761936040603, + "link": null, + "locked": false, + "text": "db result \nto MD", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "db result \nto MD", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "fToOun2xdKMOb2aVkf29p", + "type": "text", + "x": 1120.33203125, + "y": 422.76953125, + "width": 76.0599365234375, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aE", + "roundness": null, + "seed": 585740644, + "version": 85, + "versionNonce": 1959805532, + "isDeleted": false, + "boundElements": [], + "updated": 1761936266468, + "link": null, + "locked": false, + "text": "MD to \nMermaid", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "MD to \nMermaid", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "PiKGRVvnyp7pjLD1tCi5u", + "type": "arrow", + "x": 420.1875, + "y": 224.28125, + "width": 142.8125, + "height": 58.0546875, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aF", + "roundness": { + "type": 2 + }, + "seed": 54532956, + "version": 282, + "versionNonce": 1459463140, + "isDeleted": false, + "boundElements": [], + "updated": 1761936119294, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 142.8125, + -58.0546875 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "O-PaQXH396tStCF7JKxfI", + "focus": -0.2816993651409218, + "gap": 5.510913037302248 + }, + "endBinding": { + "elementId": "IrO9UaKnLCpnbzqRSCJzy", + "focus": 0.38144607016235266, + "gap": 1 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "JU5XimKzjf75Jwi0acyvd", + "type": "arrow", + "x": 423.2396548803663, + "y": 302.0355739110941, + "width": 146.20703125, + "height": 88.73046875, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aG", + "roundness": { + "type": 2 + }, + "seed": 717361372, + "version": 418, + "versionNonce": 808204388, + "isDeleted": false, + "boundElements": [], + "updated": 1761936175410, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 146.20703125, + 88.73046875 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "O-PaQXH396tStCF7JKxfI", + "focus": 0.018883793865720252, + "gap": 5.699811232612768 + }, + "endBinding": { + "elementId": "n4vdN0n3RQnDSPjq-nr99", + "focus": -0.4998404324681728, + "gap": 1 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "siBHscB5llCUfh4O0-BYR", + "type": "arrow", + "x": 694.1420490425966, + "y": 383.9550382997701, + "width": 80.4140625, + "height": 59.02734375, + 
"angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aI", + "roundness": { + "type": 2 + }, + "seed": 665494364, + "version": 609, + "versionNonce": 449713244, + "isDeleted": false, + "boundElements": [], + "updated": 1761936148133, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 80.4140625, + -59.02734375 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "n4vdN0n3RQnDSPjq-nr99", + "focus": 0.4903538633420851, + "gap": 3.698689667596568 + }, + "endBinding": { + "elementId": "4KwTb3Ksy9iKmaG6rp9rf", + "focus": 0.526204109830513, + "gap": 1 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "lQg3kQJ64cAzOQlsm_nnV", + "type": "arrow", + "x": 692.4222629558907, + "y": 405.1802276495595, + "width": 87.06203184912158, + "height": 54.37911917950322, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aJ", + "roundness": { + "type": 2 + }, + "seed": 1585799908, + "version": 782, + "versionNonce": 564994532, + "isDeleted": false, + "boundElements": [], + "updated": 1761936258199, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 87.06203184912158, + 54.37911917950322 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "n4vdN0n3RQnDSPjq-nr99", + "focus": -0.297658746552027, + "gap": 2.391218497804516 + }, + "endBinding": { + "elementId": "bkFb3Nq2lFu--GSYS7px9", + "focus": -0.6375065786945301, + "gap": 2.847062752195484 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "il6TSwE83Z9g89nVNsKKQ", + "type": "arrow", + "x": 896.953125, + "y": 328.69140625, + "width": 152.453125, + "height": 0.7734375, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aK", + "roundness": { + "type": 2 + }, + "seed": 1539913948, + "version": 118, + "versionNonce": 1226485596, + "isDeleted": false, + "boundElements": [], + "updated": 1761936230052, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 152.453125, + -0.7734375 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "4KwTb3Ksy9iKmaG6rp9rf", + "focus": 0.09890046545733103, + "gap": 2.193359375 + }, + "endBinding": { + "elementId": "ye_64zqr7CDhyAiWzvOJm", + "focus": 0.302375871013406, + "gap": 1.466796875 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + }, + { + "id": "qgF1zyHojO8Oq_8ebpZ4V", + "type": "arrow", + "x": 905.0233071976285, + "y": 447.6495769914197, + "width": 144.8465371811934, + "height": 0.32010884601004364, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aL", + "roundness": { + "type": 2 + }, + "seed": 1109247588, + "version": 223, + "versionNonce": 1123501412, + "isDeleted": false, + "boundElements": [], + "updated": 1761936258199, + "link": null, + "locked": false, + "points": [ + [ + 
0, + 0 + ], + [ + 144.8465371811934, + -0.32010884601004364 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "bkFb3Nq2lFu--GSYS7px9", + "focus": -0.03381093133160738, + "gap": 3.695205648243359 + }, + "endBinding": { + "elementId": "ye_64zqr7CDhyAiWzvOJm", + "focus": -0.5709895611084019, + "gap": 1.6873931482432454 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py new file mode 100644 index 0000000..600e023 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py @@ -0,0 +1,102 @@ +from google.adk.agents.llm_agent import LlmAgent +from google.adk.agents.readonly_context import ReadonlyContext +from .sub_agents.database_cred_agent.agent import database_cred_agent +from .sub_agents.schema_introspection_agent.agent import schema_introspection_agent +from .sub_agents.qa_agent.agent import qa_agent +from .sub_agents.data_profiling_agent.agent import data_profiling_agent +from .sub_agents.reporting_agent.agent import reporting_agent + +import logging +import json + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def root_agent_instruction(ctx: ReadonlyContext) -> str: + """Dynamically builds the Root Agent's instruction based on session state.""" + selected_schema = ctx.state.get("selected_schema") + db_connection = ctx.state.get("db_connection") + available_schemas = ctx.state.get("available_schemas") + schema_structure = ctx.state.get("schema_structure") + data_profile = ctx.state.get("data_profile") + + base_instruction = """ + ## Role + You are the **Root Agent** responsible for coordinating sub-agents to perform database discovery, introspection, profiling, and reporting tasks. + You manage the overall flow, handle user selections, and determine which sub-agent should be called. + + ## Sub-Agent Hierarchy + You have the following sub-agents under your control: + 1. **database_cred_agent**: Collects and validates DB credentials, lists schemas. + 2. **schema_introspection_agent**: Discovers schema details, constraints, and relationships. + 3. **data_profiling_agent**: Analyzes data quality within the selected schema. + 4. **reporting_agent**: Generates summaries, exports data, and creates schema diagrams. + 5. **qa_agent**: Answers questions about the discovered schema and data profile. + --- + """ + + if not db_connection or db_connection.get("status") != "connected": + return base_instruction + """ + **Current Task:** The database is not connected. + - Greet the user and explain your purpose. + - If the user indicates they want to analyze a database, you MUST call the `database_cred_agent` to start the connection process. + Example Response: "Welcome! I'm your Data Discovery Agent. I can help you connect to, understand, profile, and report on your legacy databases. To begin, I need to connect to your database." + User Intent: "I want to analyze my DB" -> Call `database_cred_agent`. + """ + elif available_schemas and not selected_schema: + return base_instruction + """ + **Current Task:** The user has been presented with a list of available schemas by the `database_cred_agent`. Their current input is expected to be the name of the schema they wish to analyze. + + 1. 
Treat the user's entire input as the desired schema name. +    2. You MUST call the `schema_introspection_agent`. Pass the user's input as the primary query to this sub-agent. The `schema_introspection_agent` is designed to take this input as the schema name for its operations. +       - Example AgentTool Call: `schema_introspection_agent(user_input)` +    3. The `schema_introspection_agent` will handle storing the selected schema and fetching the details. Await its response. +    """ +    elif selected_schema and schema_structure: +        profile_status = "Completed" if data_profile else "Not Yet Run" +        return base_instruction + f""" +    **Current Context:** The database is connected. The schema '{selected_schema}' has been successfully introspected. +    Data Quality Profile Status: {profile_status} + +    **Task Delegation:** Based on the user's request, delegate to the appropriate sub-agent: + +    - **"Profile Data"**, **"Data Quality"**, **"Run profiling"**: +      Call `data_profiling_agent`. +      - Example: `data_profiling_agent()` + +    - **"Generate Report"**, **"Export"**, **"Diagram"**, **"Summary"**, **"ERD"**, **"JSON"**, **"YAML"**, **"Mermaid"**: +      Call `reporting_agent` and pass the user's query. +      - Example: `reporting_agent(user_input)` + +    - **ANY other questions** about the tables, columns, constraints, relationships, views, indexes, anomalies within the '{selected_schema}' schema, or about the data profile results: +      Call `qa_agent` and pass the user's question as the query. +      - Example: `qa_agent(user_question)` + +    If the user's intent is unclear, ask for clarification. You can remind them of the available actions. +    """ +    elif selected_schema and not schema_structure: +        return base_instruction + f""" +    **Current Context:** The schema '{selected_schema}' was selected, but the introspection data is missing or incomplete. +    - Call `schema_introspection_agent` again and pass the schema name '{selected_schema}' as the input to it to ensure the structure is loaded. +    - Example AgentTool Call: `schema_introspection_agent("{selected_schema}")` +    """ +    else: # Should ideally not be reached if states are managed well +        return base_instruction + """ +    **Current Task:** Determine the next step based on the conversation history and session state. If unsure, ask the user for clarification. +    """ + +data_model_discovery_agent = LlmAgent( +    model='gemini-2.5-flash', +    name='data_model_discovery_agent', +    description=( +        "A helpful root agent that orchestrates sub-agents to introspect and profile legacy databases."
+ ), + instruction=root_agent_instruction, + sub_agents=[ + database_cred_agent, + schema_introspection_agent, + qa_agent, + data_profiling_agent, + reporting_agent, + ] +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py new file mode 100644 index 0000000..d318d76 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py @@ -0,0 +1,31 @@ +from google.adk.agents.llm_agent import LlmAgent +from .tools import profile_schema_data + +data_profiling_agent = LlmAgent( + model='gemini-2.5-flash', + name='data_profiling_agent', + description='Profiles data quality for the selected schema.', + instruction=""" + ### Role + You are a Data Profiling Agent. You analyze the data within the selected schema to identify potential quality issues. + + ### Task + 1. **Invocation:** You will be called by the Root Agent when the user requests data profiling. + 2. **Call Tool:** Invoke the `profile_schema_data` tool. This tool uses the connection details, selected schema, and schema structure from the session state. You can optionally pass a `sample_size` in the args dictionary. + - Example: `profile_schema_data()` or `profile_schema_data(args={"sample_size": 5000})` + 3. **Process Results:** + - If the tool call is successful, it means the profiling is done and results are in the state key `data_profile`. + - Acknowledge completion, mentioning the schema name from the tool result. + "Data profiling for schema '{tool_result.schema_name}' is complete. I've analyzed: + - Column Nullability (for all columns, sampled) + - Column Cardinality (for key columns) + - Orphan Records (for foreign keys, sampled) + - Potential Data Type Anomalies (in text columns like phone/zip, sampled) + + The detailed results are stored. You can now ask questions about the data profile or request a report." + - If the tool returns an error, relay the error message. 
+ """, + tools=[ + profile_schema_data + ], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py new file mode 100644 index 0000000..7c8c00c --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py @@ -0,0 +1,77 @@ +import logging +from typing import Dict, Any +from google.adk.tools import ToolContext +import psycopg2 +import mysql.connector +import pyodbc +from .utils import postgres_profiling_utils, mysql_profiling_utils, mssql_profiling_utils + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: + db_type = metadata.get("db_type") + host = metadata.get("host") + port = int(metadata.get("port")) + dbname = metadata.get("dbname") + user = metadata.get("user") + logger.info(f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}") + if db_type == "postgresql": + return psycopg2.connect(host=host, port=port, dbname=dbname, user=user, password=password) + elif db_type == "mysql": + return mysql.connector.connect(host=host, port=port, database=dbname, user=user, password=password) + elif db_type == "mssql": + conn_str = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={host},{port};DATABASE={dbname};UID={user};PWD={password}" + return pyodbc.connect(conn_str) + else: + raise ValueError(f"Unsupported database type: {db_type}") + +async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any] = None) -> Dict[str, Any]: + """ + Profiles the data in the selected schema based on the schema structure. + Calculates nullability, cardinality, orphan records, and type anomalies. + """ + db_conn_state = tool_context.state.get("db_connection") + db_creds = tool_context.state.get("db_creds_temp") + schema_name = tool_context.state.get("selected_schema") + schema_structure = tool_context.state.get("schema_structure") + sample_size = args.get("sample_size", 10000) if args else 10000 + + if not db_conn_state or db_conn_state.get("status") != "connected": return {"error": "DB not connected."} + if not db_creds: return {"error": "DB credentials not found."} + if not schema_name: return {"error": "Selected schema not found."} + if not schema_structure: return {"error": "Schema structure not found. Please run introspection first."} + + metadata = db_conn_state["metadata"] + password = db_creds["password"] + db_type = metadata["db_type"] + + conn = None + try: + conn = _get_db_connection(metadata, password) + logger.info(f"Reconnected to {db_type} for data profiling of schema '{schema_name}'.") + + if db_type == "postgresql": + profile_results = postgres_profiling_utils.profile_postgres_data(conn, schema_name, schema_structure, sample_size) + elif db_type == "mysql": + profile_results = mysql_profiling_utils.profile_mysql_data(conn, schema_name, schema_structure, sample_size) + elif db_type == "mssql": + profile_results = mssql_profiling_utils.profile_mssql_data(conn, schema_name, schema_structure, sample_size) + else: + return {"error": f"Profiling for {db_type} not implemented."} + + tool_context.state["data_profile"] = profile_results + logger.info(f"Data profiling results for '{schema_name}' saved to session state.") + + return { + "status": "success", + "message": f"Data profiling completed for schema '{schema_name}'. 
Results are stored.", + "schema_name": schema_name, + } + except Exception as e: + logger.error(f"Error during data profiling: {e}", exc_info=True) + return {"error": f"Failed to profile data for {db_type} ({schema_name}): {str(e)}"} + finally: + if conn: + try: conn.close() + except Exception as e: logger.error(f"Error closing {db_type} connection: {e}") \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py new file mode 100644 index 0000000..428c949 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py @@ -0,0 +1,108 @@ +import logging +from typing import Dict, Any, List + +logger = logging.getLogger(__name__) + +def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: + cursor = conn.cursor() + try: + cursor.execute(query) + if cursor.description: + columns = [column[0] for column in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, [val for val in row])) for row in rows] + return [] + finally: + cursor.close() + +def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, Any], sample_size: int = 10000) -> Dict[str, Any]: + profile_results = {"nullability": {}, "cardinality": {}, "orphan_records": {}, "type_anomalies": {}} + tables = schema_structure.get("tables", {}) + + for table_name, table_info in tables.items(): + logger.info(f"Profiling table: {schema_name}.{table_name}") + profile_results["nullability"][table_name] = {} + profile_results["cardinality"][table_name] = {} + full_table_name = f"[{schema_name}].[{table_name}]" + + # Nullability + for col_name in table_info.get("columns", {}): + null_q = f""" + SELECT + COUNT_BIG(*) as total_count, + COUNT_BIG(*) - COUNT([{col_name}]) as null_count + FROM (SELECT TOP {sample_size} [{col_name}] FROM {full_table_name}) as sampled; + """ + try: + res = _execute_query(conn, null_q)[0] + null_pct = (res['null_count'] / res['total_count']) * 100 if res['total_count'] > 0 else 0 + profile_results["nullability"][table_name][col_name] = round(null_pct, 2) + except Exception as e: + logger.error(f"Error profiling nulls for {table_name}.{col_name}: {e}") + profile_results["nullability"][table_name][col_name] = "Error" + + # Cardinality - PKs, FKs + key_columns = set() + for const in table_info.get("constraints", []): + if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): + key_columns.add(const["columns"]) + for fk in schema_structure.get("foreign_keys", []): + if fk.get("from_table") == table_name and fk.get("from_column"): + key_columns.add(fk["from_column"]) + + for col_name in key_columns: + if col_name in table_info.get("columns", {}): + card_q = f"SELECT COUNT(DISTINCT [{col_name}]) as unique_count FROM {full_table_name};" + try: + res = _execute_query(conn, card_q)[0] + profile_results["cardinality"][table_name][col_name] = res['unique_count'] + except Exception as e: + logger.error(f"Error profiling cardinality for {table_name}.{col_name}: {e}") + profile_results["cardinality"][table_name][col_name] = "Error" 
+ + # Orphan Records + for fk in schema_structure.get("foreign_keys", []): + from_table, from_col = fk.get("from_table"), fk.get("from_column") + to_table, to_col = fk.get("to_table"), fk.get("to_column") + if from_table and from_col and to_table and to_col: + fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" + logger.info(f"Checking orphans for {fk_name}") + from_full = f"[{schema_name}].[{from_table}]" + to_full = f"[{schema_name}].[{to_table}]" + orphan_q = f""" + SELECT + COUNT_BIG(s.[{from_col}]) as total_fk_values, + SUM(CASE WHEN t.[{to_col}] IS NULL THEN 1 ELSE 0 END) as orphan_count + FROM (SELECT TOP {sample_size} [{from_col}] FROM {from_full} WHERE [{from_col}] IS NOT NULL) as s + LEFT JOIN {to_full} t ON s.[{from_col}] = t.[{to_col}]; + """ + try: + res = _execute_query(conn, orphan_q)[0] + orphan_pct = (res['orphan_count'] / res['total_fk_values']) * 100 if res['total_fk_values'] > 0 else 0 + profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) + except Exception as e: + logger.error(f"Error checking orphans for {fk_name}: {e}") + profile_results["orphan_records"][fk_name] = "Error" + + # Type Anomalies - Heuristic for phone/zip + for table_name, table_info in tables.items(): + full_table_name = f"[{schema_name}].[{table_name}]" + for col_name, col_info in table_info.get("columns", {}).items(): + col_type = col_info.get("type", "").lower() + if "char" in col_type or "text" in col_type: + if "phone" in col_name.lower() or "zip" in col_name.lower() or "postal" in col_name.lower(): + anomaly_q = f""" + SELECT COUNT_BIG(*) as non_numeric_count + FROM (SELECT TOP {sample_size} [{col_name}] FROM {full_table_name} WHERE [{col_name}] IS NOT NULL) as s + WHERE [{col_name}] LIKE '%[^0-9.-]%'; + """ + try: + res = _execute_query(conn, anomaly_q)[0] + if res['non_numeric_count'] > 0: + key = f"{table_name}.{col_name}" + if key not in profile_results["type_anomalies"]: + profile_results["type_anomalies"][key] = [] + profile_results["type_anomalies"][key].append(f"Found {res['non_numeric_count']} rows with non-numeric characters in sample.") + except Exception as e: + logger.warning(f"Error checking type anomaly for {table_name}.{col_name}: {e}") + return profile_results \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py new file mode 100644 index 0000000..4f8591f --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py @@ -0,0 +1,105 @@ +import logging +from typing import Dict, Any, List + +logger = logging.getLogger(__name__) + +def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: + cursor = conn.cursor(dictionary=True) + try: + cursor.execute(query) + return cursor.fetchall() + finally: + cursor.close() + +def profile_mysql_data(conn: Any, schema_name: str, schema_structure: Dict[str, Any], sample_size: int = 10000) -> Dict[str, Any]: + try: + conn.database = schema_name + except Exception as e: + logger.error(f"Failed to set database {schema_name}: {e}") + raise + + profile_results = {"nullability": {}, "cardinality": {}, "orphan_records": {}, "type_anomalies": {}} + tables = schema_structure.get("tables", {}) + + for table_name, table_info in tables.items(): + logger.info(f"Profiling table: {schema_name}.{table_name}") + 
profile_results["nullability"][table_name] = {} + profile_results["cardinality"][table_name] = {} + # Nullability + for col_name in table_info.get("columns", {}): + null_q = f""" + SELECT + COUNT(*) as total_count, + SUM(CASE WHEN `{col_name}` IS NULL THEN 1 ELSE 0 END) as null_count + FROM (SELECT `{col_name}` FROM `{table_name}` LIMIT {sample_size}) as sampled; + """ + try: + res = _execute_query(conn, null_q)[0] + null_pct = (res['null_count'] / res['total_count']) * 100 if res['total_count'] > 0 else 0 + profile_results["nullability"][table_name][col_name] = round(null_pct, 2) + except Exception as e: + logger.error(f"Error profiling nulls for {table_name}.{col_name}: {e}") + profile_results["nullability"][table_name][col_name] = "Error" + + # Cardinality - PKs, FKs + key_columns = set() + for const in table_info.get("constraints", []): + if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): + key_columns.add(const["columns"]) + for fk in schema_structure.get("foreign_keys", []): + if fk.get("from_table") == table_name and fk.get("from_column"): + key_columns.add(fk["from_column"]) + + for col_name in key_columns: + if col_name in table_info.get("columns", {}): + card_q = f"SELECT COUNT(DISTINCT `{col_name}`) as unique_count FROM `{table_name}`;" + try: + res = _execute_query(conn, card_q)[0] + profile_results["cardinality"][table_name][col_name] = res['unique_count'] + except Exception as e: + logger.error(f"Error profiling cardinality for {table_name}.{col_name}: {e}") + profile_results["cardinality"][table_name][col_name] = "Error" + + # Orphan Records + for fk in schema_structure.get("foreign_keys", []): + from_table, from_col = fk.get("from_table"), fk.get("from_column") + to_table, to_col = fk.get("to_table"), fk.get("to_column") + if from_table and from_col and to_table and to_col: + fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" + logger.info(f"Checking orphans for {fk_name}") + orphan_q = f""" + SELECT + COUNT(s.`{from_col}`) as total_fk_values, + SUM(CASE WHEN t.`{to_col}` IS NULL THEN 1 ELSE 0 END) as orphan_count + FROM (SELECT `{from_col}` FROM `{from_table}` WHERE `{from_col}` IS NOT NULL LIMIT {sample_size}) as s + LEFT JOIN `{to_table}` t ON s.`{from_col}` = t.`{to_col}`; + """ + try: + res = _execute_query(conn, orphan_q)[0] + orphan_pct = (res['orphan_count'] / res['total_fk_values']) * 100 if res['total_fk_values'] > 0 else 0 + profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) + except Exception as e: + logger.error(f"Error checking orphans for {fk_name}: {e}") + profile_results["orphan_records"][fk_name] = "Error" + + # Type Anomalies - Heuristic for phone/zip + for table_name, table_info in tables.items(): + for col_name, col_info in table_info.get("columns", {}).items(): + col_type = col_info.get("type", "").lower() + if "char" in col_type or "text" in col_type: + if "phone" in col_name.lower() or "zip" in col_name.lower() or "postal" in col_name.lower(): + anomaly_q = f""" + SELECT COUNT(*) as non_numeric_count + FROM (SELECT `{col_name}` FROM `{table_name}` WHERE `{col_name}` IS NOT NULL LIMIT {sample_size}) as s + WHERE `{col_name}` REGEXP '[^0-9.-]'; + """ + try: + res = _execute_query(conn, anomaly_q)[0] + if res['non_numeric_count'] > 0: + key = f"{table_name}.{col_name}" + if key not in profile_results["type_anomalies"]: + profile_results["type_anomalies"][key] = [] + profile_results["type_anomalies"][key].append(f"Found {res['non_numeric_count']} rows with non-numeric characters in sample.") + except 
Exception as e: + logger.warning(f"Error checking type anomaly for {table_name}.{col_name}: {e}") + return profile_results \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py new file mode 100644 index 0000000..b6a0756 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py @@ -0,0 +1,106 @@ +import logging +from typing import Dict, Any, List + +logger = logging.getLogger(__name__) + +def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts.""" + cursor = conn.cursor() + try: + conn.autocommit = True # Ensure no lingering transactions + cursor.execute(query) + if cursor.description: + columns = [desc[0] for desc in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, row)) for row in rows] + return [] + finally: + cursor.close() + +def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[str, Any], sample_size: int = 10000) -> Dict[str, Any]: + profile_results = {"nullability": {}, "cardinality": {}, "orphan_records": {}, "type_anomalies": {}} + tables = schema_structure.get("tables", {}) + + for table_name, table_info in tables.items(): + logger.info(f"Profiling table: {schema_name}.{table_name}") + profile_results["nullability"][table_name] = {} + profile_results["cardinality"][table_name] = {} + # Nullability (AC 4.1) + for col_name in table_info.get("columns", {}): + null_q = f""" + SELECT + COUNT(*) as total_count, + COUNT(*) - COUNT("{col_name}") as null_count + FROM (SELECT "{col_name}" FROM "{schema_name}"."{table_name}" LIMIT {sample_size}) as sampled; + """ + try: + res = _execute_query(conn, null_q)[0] + null_pct = (res['null_count'] / res['total_count']) * 100 if res['total_count'] > 0 else 0 + profile_results["nullability"][table_name][col_name] = round(null_pct, 2) + except Exception as e: + logger.error(f"Error profiling nulls for {table_name}.{col_name}: {e}") + profile_results["nullability"][table_name][col_name] = "Error" + + # Cardinality (AC 4.2) - PKs, FKs + key_columns = set() + for const in table_info.get("constraints", []): + if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): + key_columns.add(const["columns"]) + for fk in schema_structure.get("foreign_keys", []): + if fk.get("from_table") == table_name and fk.get("from_column"): + key_columns.add(fk["from_column"]) + + for col_name in key_columns: + if col_name in table_info.get("columns", {}): + card_q = f'SELECT COUNT(DISTINCT "{col_name}") as unique_count FROM "{schema_name}"."{table_name}";' + try: + res = _execute_query(conn, card_q)[0] + profile_results["cardinality"][table_name][col_name] = res['unique_count'] + except Exception as e: + logger.error(f"Error profiling cardinality for {table_name}.{col_name}: {e}") + profile_results["cardinality"][table_name][col_name] = "Error" + + # Orphan Records (AC 4.3) + for fk in schema_structure.get("foreign_keys", []): + from_table, from_col = fk.get("from_table"), fk.get("from_column") + to_table, to_col = fk.get("to_table"), fk.get("to_column") + if from_table and from_col and to_table and to_col: + fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" + logger.info(f"Checking orphans for {fk_name}") + orphan_q = 
f""" + SELECT + COUNT(s."{from_col}") as total_fk_values, + SUM(CASE WHEN t."{to_col}" IS NULL THEN 1 ELSE 0 END) as orphan_count + FROM (SELECT "{from_col}" FROM "{schema_name}"."{from_table}" WHERE "{from_col}" IS NOT NULL LIMIT {sample_size}) as s + LEFT JOIN "{schema_name}"."{to_table}" t ON s."{from_col}" = t."{to_col}"; + """ + try: + res = _execute_query(conn, orphan_q)[0] + orphan_pct = (res['orphan_count'] / res['total_fk_values']) * 100 if res['total_fk_values'] > 0 else 0 + profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) + except Exception as e: + logger.error(f"Error checking orphans for {fk_name}: {e}") + profile_results["orphan_records"][fk_name] = "Error" + + # Type Anomalies (AC 4.4) - Heuristic for phone/zip + for table_name, table_info in tables.items(): + for col_name, col_info in table_info.get("columns", {}).items(): + col_type = col_info.get("type", "").lower() + if "char" in col_type or "text" in col_type: + if "phone" in col_name.lower() or "zip" in col_name.lower() or "postal" in col_name.lower(): + anomaly_q = f""" + SELECT COUNT(*) as non_numeric_count + FROM (SELECT "{col_name}" FROM "{schema_name}"."{table_name}" WHERE "{col_name}" IS NOT NULL LIMIT {sample_size}) as s + WHERE "{col_name}" ~ '[^0-9.-]'; + """ + try: + res = _execute_query(conn, anomaly_q)[0] + if res['non_numeric_count'] > 0: + key = f"{table_name}.{col_name}" + if key not in profile_results["type_anomalies"]: + profile_results["type_anomalies"][key] = [] + profile_results["type_anomalies"][key].append(f"Found {res['non_numeric_count']} rows with non-numeric characters in sample.") + except Exception as e: + logger.warning(f"Error checking type anomaly for {table_name}.{col_name}: {e}") + + return profile_results \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/__init__.py @@ -0,0 +1 @@ +from . import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py new file mode 100644 index 0000000..25b1f0b --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py @@ -0,0 +1,52 @@ +from google.adk.agents.llm_agent import LlmAgent +from .tools import validate_db_connection + + +database_cred_agent = LlmAgent( + model='gemini-2.5-flash', + name='database_cred_agent', + description='A helpful assistant that collects and validates database connection details, and lists available schemas.', + instruction=""" + ### Role + You are a helpful and meticulous assistant responsible for collecting database connection details from the user, validating them, and listing the available schemas for selection. + + ### Instructions + 1. **Greeting & Purpose:** Politely inform the user that to proceed with database introspection, you need to establish a connection, which requires a few details. + + 2. 
**Request Information:** Request the following information from the user: + * **Host:** (e.g., localhost, server.example.com) + * **Port:** (e.g., 5432 for PostgreSQL, 3306 for MySQL, 1433 for MSSQL) + * **Database Name:** (The specific database to connect to) + * **User:** (The username for database authentication) + * **Password:** (The password for database authentication) + * **Database Type:** Clearly state the supported types: "postgresql", "mysql", or "mssql". + + 3. **Ensure Completeness:** Do not proceed to validation until ALL six pieces of information have been provided. + * If any field is missing, politely ask the user specifically for the missing detail(s). + + 4. **Call Validation Tool:** Once all details are collected, you MUST call the `validate_db_connection` tool. Pass all the collected information as a single dictionary argument named `connection_details`. + + 5. **Handle Validation Response:** + * **On Success:** If the `validate_db_connection` tool returns a "success" status: + 1. Acknowledge the successful connection. + 2. Retrieve the list of schemas from the tool's output (`schemas` key). + 3. Present the available schemas to the user. Each schema should be on a new line, prepended with '- '. For example: + "Connection successful! Here are the available schemas: + - schema1 + - schema2 + - schema3" + 4. Ask the user to specify which schema they want to analyze: + "\n\nPlease type the name of the schema you would like to analyze." + 5. Your task ends here. The user's next message will be the schema name. + + * **On Error:** If the tool returns an "error" status, display the error message from the tool to the user and ask if they would like to try again. + + ### Notes + * Always maintain a polite and professional tone. + * You do not know what the user will select. Do not attempt to confirm a selection. + * You do not connect to the database or modify session state yourself; you ONLY collect details, use the `validate_db_connection` tool, and report the results. 
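For illustration, the flow described above reduces to the following sketch; the values are hypothetical, and the `validate_db_connection` contract is the one defined in `database_cred_agent/tools.py` later in this patch:

```python
# Illustrative sketch only, not part of the patch; all values are hypothetical.
connection_details = {
    "host": "localhost",
    "port": 5432,
    "dbname": "salesdb",       # hypothetical database name
    "user": "readonly_user",   # hypothetical user
    "password": "********",
    "db_type": "postgresql",   # supported: postgresql, mysql, mssql
}
# The agent runtime awaits the tool with the ADK-provided ToolContext:
#   result = await validate_db_connection(connection_details, tool_context)
# Expected shape on success (per the tool defined below):
#   {"status": "success",
#    "message": "POSTGRESQL connection validated successfully.",
#    "schemas": ["public", "sales"]}
```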
+ """, + tools=[ + validate_db_connection + ], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py new file mode 100644 index 0000000..2718f38 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py @@ -0,0 +1,147 @@ +from google.adk.tools import ToolContext +import logging +from typing import Dict, Any, List + +# Import database connectors +import psycopg2 +import mysql.connector +import pyodbc + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def _get_schemas(conn: Any, db_type: str) -> List[str]: + """Fetches list of schemas/databases based on db type.""" + schemas = [] + cursor = conn.cursor() + try: + if db_type == "postgresql": + cursor.execute( + "SELECT schema_name FROM information_schema.schemata " + "WHERE schema_name NOT LIKE 'pg_%' AND schema_name != 'information_schema';" + ) + schemas = [row[0] for row in cursor.fetchall()] + elif db_type == "mysql": + cursor.execute("SHOW DATABASES;") + # Filter out default mysql databases + default_dbs = {'information_schema', 'mysql', 'performance_schema', 'sys'} + schemas = [row[0] for row in cursor.fetchall() if row[0] not in default_dbs] + elif db_type == "mssql": + cursor.execute("SELECT name FROM sys.schemas;") + # Filter out default mssql schemas + default_schemas = { + 'db_accessadmin', 'db_backupoperator', 'db_datareader', 'db_datawriter', + 'db_ddladmin', 'db_denydatareader', 'db_denydatawriter', 'db_owner', + 'db_securityadmin', 'guest', 'INFORMATION_SCHEMA', 'sys' + } + schemas = [row[0] for row in cursor.fetchall() if row[0] not in default_schemas] + finally: + cursor.close() + return schemas + +async def validate_db_connection(connection_details: Dict[str, Any], tool_context: ToolContext) -> Dict[str, Any]: + """Validates a database connection for PostgreSQL, MySQL, or MSSQL, + fetches available schemas, and saves metadata to session memory. + + Args: + connection_details: Database credentials including host, port, dbname, user, password, + and db_type ("postgresql", "mysql", or "mssql"). + tool_context: The runtime context used to store session-level state. + + Returns: + A dict with: + - status: "success" if connection is valid, else "error". + - message: Details about the validation result. + - schemas: List of schemas (only on success). 
+ """ + safe_log = {k: v for k, v in connection_details.items() if k != "password"} + logger.info(f"Attempting connection with details: {safe_log}") + + required_keys = ["host", "port", "dbname", "user", "password", "db_type"] + missing_keys = [k for k in required_keys if k not in connection_details] + if missing_keys: + error_msg = f"Missing required parameters: {', '.join(missing_keys)}" + logger.error(error_msg) + return {"status": "error", "message": error_msg} + + db_type = connection_details["db_type"].lower() + conn = None + try: + if db_type == "postgresql": + conn = psycopg2.connect( + host=connection_details["host"], + port=connection_details["port"], + dbname=connection_details["dbname"], + user=connection_details["user"], + password=connection_details["password"], + ) + elif db_type == "mysql": + conn = mysql.connector.connect( + host=connection_details["host"], + port=connection_details["port"], + database=connection_details["dbname"], + user=connection_details["user"], + password=connection_details["password"], + ) + elif db_type == "mssql": + conn_str = ( + f"DRIVER={{ODBC Driver 17 for SQL Server}};" + f"SERVER={connection_details['host']},{connection_details['port']};" + f"DATABASE={connection_details['dbname']};" + f"UID={connection_details['user']};" + f"PWD={connection_details['password']}" + ) + conn = pyodbc.connect(conn_str) + else: + error_msg = f"Unsupported database type: {db_type}. Supported types are: postgresql, mysql, mssql." + logger.error(error_msg) + return {"status": "error", "message": error_msg} + + logger.info(f"{db_type.upper()} connection established successfully for validation.") + + # Fetch schemas + schemas = _get_schemas(conn, db_type) + logger.info(f"Successfully fetched schemas: {schemas}") + + # Clear any previous connection state + if "db_connection" in tool_context.state: + del tool_context.state["db_connection"] + if "db_creds_temp" in tool_context.state: + del tool_context.state["db_creds_temp"] + if "selected_schema" in tool_context.state: + del tool_context.state["selected_schema"] + + tool_context.state["db_connection"] = { + "metadata": { + "host": connection_details["host"], + "port": connection_details["port"], + "dbname": connection_details["dbname"], + "user": connection_details["user"], + "db_type": db_type, + }, + "status": "connected", + } + tool_context.state["db_creds_temp"] = {"password": connection_details["password"]} + + logger.info("Connection metadata saved in session memory.") + return { + "status": "success", + "message": f"{db_type.upper()} connection validated successfully.", + "schemas": schemas + } + + except Exception as e: + logger.error(f"Database connection or schema fetch failed for {db_type}: {e}") + if "db_connection" in tool_context.state: + del tool_context.state["db_connection"] + if "db_creds_temp" in tool_context.state: + del tool_context.state["db_creds_temp"] + return {"status": "error", "message": f"Connection/Schema fetch failed for {db_type}: {e}"} + + finally: + if conn: + try: + conn.close() + logger.info(f"{db_type.upper()} connection closed.") + except Exception as e: + logger.error(f"Error closing {db_type} connection: {e}") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ 
b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/__init__.py @@ -0,0 +1 @@ +from . import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/agent.py new file mode 100644 index 0000000..35e4061 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/agent.py @@ -0,0 +1,43 @@ +from google.adk.agents import LlmAgent +from google.adk.tools.agent_tool import AgentTool +from .tools import execute_sql_query +from .sub_agents.postgres_sql_agent.agent import postgres_sql_agent + +database_introspection_agent = LlmAgent( + name="database_introspection_agent", + model='gemini-2.5-flash', + description="Handles database interactions, including generating and executing SQL queries.", + instruction=""" + You are a Database Interaction Agent. Your tasks involve understanding user requests related to database operations, generating the appropriate SQL query using a specialized sub-agent, and executing the query. + + 1. **Understand Request:** Determine what the user wants to do with the database (e.g., select data, count rows, etc.). + + 2. **Check Connection:** Verify that a database connection is active by checking the session state. (You don't need a tool for this, just know it's a prerequisite). + + 3. **Generate SQL:** Use the appropriate sub-agent to generate the SQL query. Currently, only PostgreSQL is supported via `postgres_sql_agent`. + - Invoke `postgres_sql_agent` with the user's natural language request. + + 4. **Execute SQL:** Take the SQL query output from the sub-agent and use the `execute_sql_query` tool to run it against the database. + + 5. **Present Results:** Relay the results or status from the `execute_sql_query` tool back to the user in a clear and understandable way. + - If the result contains data, format it nicely. + - If it's an error, explain the error. + + **Example Flow:** + User: "How many customers do we have?" + You: (Recognize this needs a SQL query) + You: (Call `postgres_sql_agent` with "How many customers do we have?") + `postgres_sql_agent`: "SELECT COUNT(*) FROM customers;" + You: (Take the SQL string) + You: (Call `execute_sql_query` with sql_query="SELECT COUNT(*) FROM customers;") + `execute_sql_query`: Returns success with the count. + You: "There are [count] customers." + + **Constraint:** Only use the `postgres_sql_agent` for generating SQL. + **Note:** The `postgres_sql_agent` is specifically for PostgreSQL databases. + """, + tools=[ + AgentTool(agent=postgres_sql_agent), + execute_sql_query + ], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/__init__.py @@ -0,0 +1 @@ +from . 
import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/agent.py new file mode 100644 index 0000000..32a6655 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/agent.py @@ -0,0 +1,20 @@ +from google.adk.agents import LlmAgent + +postgres_sql_agent = LlmAgent( + name="postgres_sql_agent", + model='gemini-2.5-flash', + description="A specialized agent that generates PostgreSQL SQL queries based on natural language requests.", + instruction=""" + You are a PostgreSQL expert. Your task is to generate a single, executable PostgreSQL SQL query based on the user's request. + - Only output the SQL query. + - Do not include any explanations, backticks, or "SQL" markers, just the raw query. + - If the request is ambiguous, ask for clarification, but strive to generate a query if possible. + - Assume standard SQL and PostgreSQL syntax. + + Example Request: "Show me all users from the users table" + Example Output: SELECT * FROM users; + + Example Request: "Find the average age of employees" + Example Output: SELECT AVG(age) FROM employees; + """, +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/tools.py new file mode 100644 index 0000000..04d71a5 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/tools.py @@ -0,0 +1,66 @@ +import logging +import json +import psycopg2 +from typing import Dict, Any +from google.adk.tools import ToolContext + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +async def execute_sql_query(sql_query: str, tool_context: ToolContext) -> Dict[str, Any]: + """Executes a read-only SQL SELECT query using stored PostgreSQL connection metadata. + + Args: + sql_query: The SQL SELECT statement to execute. + tool_context: Provides session state containing saved database metadata. + + Returns: + A dictionary with: + - result: JSON string containing query results or an error message. 
+ """ + logger.info(f"Running SQL query: {sql_query}") + + # Ensure the query is read-only + if not sql_query.strip().lower().startswith("select"): + logger.warning("Only SELECT queries are allowed.") + return {"result": json.dumps({"error": "Only SELECT queries are allowed."})} + + # Retrieve stored connection metadata + db_conn = tool_context.state.get("db_connection") + if not db_conn or db_conn.get("status") != "connected": + logger.error("No valid database connection found.") + return {"result": json.dumps({"error": "Database not connected or inactive."})} + + metadata = db_conn.get("metadata") + if not metadata: + logger.error("Database metadata missing in session state.") + return {"result": json.dumps({"error": "Missing database metadata."})} + + try: + # Create a temporary connection for query execution + conn = psycopg2.connect( + host=metadata["host"], + port=metadata["port"], + dbname=metadata["dbname"], + user=metadata["user"], + password="postgres", + ) + + # log the connection object + logger.info(f"******* Connection object: {conn}") + + # Execute the query + with conn.cursor() as cursor: + cursor.execute(sql_query) + columns = [desc[0] for desc in cursor.description] + rows = cursor.fetchall() + result = [dict(zip(columns, row)) for row in rows] + + conn.close() + logger.info(f"Query executed successfully — rows returned: {len(result)}") + return {"result": json.dumps(result, default=str)} + + except Exception as e: + logger.error(f"SQL execution failed: {e}") + return {"result": json.dumps({"error": str(e)})} \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/__init__.py @@ -0,0 +1 @@ +from . import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py new file mode 100644 index 0000000..c2d3633 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py @@ -0,0 +1,66 @@ +from google.adk.agents.llm_agent import LlmAgent +from google.adk.agents.readonly_context import ReadonlyContext +import json + +def qa_agent_instruction(ctx: ReadonlyContext) -> str: + """Dynamically builds the QA agent's instruction, providing the schema structure.""" + schema_structure = ctx.state.get("schema_structure") + selected_schema = ctx.state.get("selected_schema", "the selected schema") + + if not schema_structure: + return f""" + ### Role + You are a Database Schema Q&A Assistant. However, the schema details for '{selected_schema}' are not available. + + ### Task + Inform the user that the schema information is missing and needs to be introspected first. + Example: "I don't have the schema details for '{selected_schema}' yet. Please run the schema discovery/introspection first." + """ + + schema_json = json.dumps(schema_structure, indent=2) + + return f""" + ### Role + You are a Database Schema Q&A Assistant. Your goal is to answer user questions based *only* on the provided database schema structure. 
+ + ### Schema Context for '{selected_schema}' + The following JSON object contains the discovered schema details, including tables, columns, data types, constraints, indexes, views, foreign keys, inferred relationships, and anomalies: + + ```json + {schema_json} + ``` + + ### Instructions + 1. **Analyze the Question:** Carefully understand what information the user is asking for. The question will be the user's input query. + 2. **Consult Schema Context:** Base your answer *exclusively* on the JSON data provided above. Do not infer or assume any information not present. + 3. **Extract Information:** Navigate the JSON structure to find the relevant details. + 4. **Formulate Answer:** Provide a clear, concise answer to the user's question. + * If listing items, use bullet points. + * If describing a table or column, be specific about its properties. + 5. **Handle Missing Information (AC 5.5):** If the user asks about a table, column, or concept not found in the provided JSON, state clearly that the information is not available in the analyzed schema. Example: "The table 'X' was not found in the schema '{selected_schema}'." + + ### Examples of How to Answer: + + * **"List all tables":** Extract keys from the `tables` object. + * **"How many tables are there?":** Count the keys in the `tables` object. + * **"What are the columns in the 'patients' table?":** Look up `tables['patients']['columns']` and list the column names and their types. + * **"Describe the 'email' column in the 'users' table":** Find `tables['users']['columns']['email']` and list all its properties (type, nullable, default, etc.). + * **"What are the constraints on the 'users' table?":** List the items in `tables['users']['constraints']`. + * **"Show me indexes for the 'orders' table":** List items from `tables['orders']['indexes']`. + * **"Are there any views?":** Check if the `views` object has entries. List them if present. + * **"Show me the SQL definition for the view 'active_customers'":** Retrieve the value of `views['active_customers']['definition']`. + * **"List foreign keys for the 'order_items' table":** Filter the `foreign_keys` list where `from_table` is 'order_items'. + * **"Which tables have a foreign key to the 'products' table?":** Filter the `foreign_keys` list where `to_table` is 'products'. + * **"Any inferred relationships for 'user_id'?":** Check the `inferred_relationships` list for entries involving 'user_id'. + * **"Are there any relationship anomalies?":** Report findings from the `anomalies` list. + * **"What is the data type of 'created_at' in 'audits'?":** Get `tables['audits']['columns']['created_at']['type']`. + + Answer truthfully based *only* on the provided JSON data. 
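For context, a minimal, hypothetical `schema_structure` of the shape produced by the introspection utilities later in this patch might look like the sketch below (keys such as `inferred_relationships` and `anomalies` may also be present):

```python
# Hypothetical example of the session-state value the QA agent reasons over.
schema_structure = {
    "tables": {
        "users": {
            "columns": {
                "id": {"type": "integer", "nullable": False, "default": None},
                "email": {"type": "character varying", "length": 255, "nullable": False, "default": None},
            },
            "constraints": [{"name": "users_pkey", "type": "PRIMARY KEY", "columns": "id"}],
            "indexes": [{"name": "users_email_idx", "columns": ["email"], "unique": True}],
        },
    },
    "views": {},
    "foreign_keys": [
        {"from_table": "orders", "from_column": "user_id",
         "to_table": "users", "to_column": "id", "constraint_name": "orders_user_id_fkey"},
    ],
}
```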
+ """ + +qa_agent = LlmAgent( + model='gemini-2.5-flash', # Or a model better suited for JSON interpretation if needed + name='qa_agent', + description='Answers natural language questions about the discovered database schema structure.', + instruction=qa_agent_instruction, +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py new file mode 100644 index 0000000..3c88285 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py @@ -0,0 +1,56 @@ +from google.adk.agents.llm_agent import LlmAgent +from .tools import generate_summary_report, export_full_report, generate_erd_script + +reporting_agent = LlmAgent( + model='gemini-2.5-flash', + name='reporting_agent', + description='Generates reports, exports data, and creates schema diagrams.', + instruction=""" + ### Role + You are a Reporting Agent. You generate human-readable summaries, export detailed data, and create scripts for schema visualizations based on the analysis performed by other agents. + + ### Context + - You rely on data stored in the session state: + - `selected_schema`: The name of the analyzed schema. + - `schema_structure`: Detailed schema information from introspection. + - `data_profile`: Data quality profiling results. + + ### Tasks + Based on the user's request, call the appropriate tool: + + 1. **Summary Report (AC 5.1):** + - If the user asks for a "summary", "overview", or "high-level report". + - Call: `generate_summary_report()` + - Present the `report_text` from the tool result to the user. + + 2. **Export Full Report (AC 5.2):** + - If the user asks to "export", "get all data", "save report", or specifies a format like "JSON" or "YAML". + - Determine the format. Default to JSON if not specified. + - Call: `export_full_report(args={"format": "json"})` or `export_full_report(args={"format": "yaml"})`. + - Inform the user the report is generated and provide the content within a code block. Example: + "Here is the full report in {tool_result.format} format: + ``` {tool_result.format.lower()} + {tool_result.report_content} + ```" + + 3. **Generate ERD Script (AC 5.3):** + - If the user asks for an "ERD", "diagram", "schema visual", "Mermaid script", or "PlantUML script". + - Currently, only Mermaid is supported. + - Call: `generate_erd_script()` + - Inform the user the script is generated and provide it in a Mermaid code block. Example: + "Here is the {tool_result.script_type} script for the ERD: + ```mermaid + {tool_result.script} + ``` + You can paste this into a {tool_result.script_type} renderer to visualize the schema." + + 4. **Error Handling:** + - If a tool returns an error, relay the error message to the user. + - If required data (like `schema_structure`) is missing, guide the user to run the necessary previous steps (e.g., schema introspection). 
+ """, + tools=[ + generate_summary_report, + export_full_report, + generate_erd_script, + ], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py new file mode 100644 index 0000000..bcb3ecc --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py @@ -0,0 +1,136 @@ +import logging +from typing import Dict, Any, List +from google.adk.tools import ToolContext +import json +import yaml + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +async def generate_summary_report(tool_context: ToolContext, args: Dict[str, Any] = None) -> Dict[str, Any]: + """Generates a high-level summary report of the database analysis.""" + schema_structure = tool_context.state.get("schema_structure") + data_profile = tool_context.state.get("data_profile") + selected_schema = tool_context.state.get("selected_schema", "N/A") + + if not schema_structure: + return {"error": "Schema structure not found. Please run introspection first."} + + summary = { + "tables": len(schema_structure.get("tables", {})), + "views": len(schema_structure.get("views", {})), + "explicit_fks": len(schema_structure.get("foreign_keys", [])), + "inferred_relationships": len(schema_structure.get("inferred_relationships", [])), + "schema_anomalies": len(schema_structure.get("anomalies", [])), + "columns": sum(len(t.get("columns", {})) for t in schema_structure.get("tables", {}).values()), + } + + report = f"### Data Discovery Summary for Schema: {selected_schema}\n\n" + report += "**Schema Structure:**\n" + report += f"- Tables Analyzed: {summary['tables']}\n" + report += f"- Total Columns: {summary['columns']}\n" + report += f"- Views Found: {summary['views']}\n" + report += f"- Explicit Foreign Keys: {summary['explicit_fks']}\n" + report += f"- Potential Inferred Relationships: {summary['inferred_relationships']}\n" + report += f"- Schema Anomalies Detected: {summary['schema_anomalies']}\n\n" + + if data_profile: + report += "**Data Quality Profile Highlights:**\n" + null_issues = sum(1 for table in data_profile.get("nullability", {}).values() for null_pct in table.values() if isinstance(null_pct, (int, float)) and null_pct > 50) + orphan_issues = sum(1 for orphan_pct in data_profile.get("orphan_records", {}).values() if isinstance(orphan_pct, (int, float)) and orphan_pct > 10) + type_anomalies = len(data_profile.get("type_anomalies", {})) + + report += f"- Columns with >50% NULLs: {null_issues} (in sampled data)\n" + report += f"- FKs with >10% Orphan Records: {orphan_issues} (in sampled data)\n" + report += f"- Columns with Potential Type Anomalies: {type_anomalies} (in sampled data)\n" + else: + report += "**Data Quality Profile:** Not yet run.\n" + + return {"status": "success", "report_text": report} + +async def export_full_report(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: + """Exports the full schema structure and data profile as JSON or YAML.""" + schema_structure = tool_context.state.get("schema_structure") + data_profile = tool_context.state.get("data_profile") + format = args.get("format", "json").lower() + + if not schema_structure: + return {"error": "Schema structure not found. 
Please run introspection first."} + + full_report_data = { + "schema_structure": schema_structure, + "data_profile": data_profile or "Not run", + } + + try: + if format == "yaml" or format == "yml": + output = yaml.dump(full_report_data, indent=2, sort_keys=False) + file_type = "YAML" + else: # Default to JSON + output = json.dumps(full_report_data, indent=2) + file_type = "JSON" + + return { + "status": "success", + "message": f"Full report generated in {file_type} format. You can copy the content below.", + "report_content": output, + "format": file_type + } + except Exception as e: + logger.error(f"Error generating {format} report: {e}", exc_info=True) + return {"error": f"Failed to generate {format} report: {str(e)}"} + +async def generate_erd_script(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: + """Generates a Mermaid script for an Entity Relationship Diagram.""" + schema_structure = tool_context.state.get("schema_structure") + selected_schema = tool_context.state.get("selected_schema", "Schema") + + if not schema_structure: + return {"error": "Schema structure not found. Please run introspection first."} + + tables = schema_structure.get("tables", {}) + fks = schema_structure.get("foreign_keys", []) + inferred = schema_structure.get("inferred_relationships", []) + + mermaid_script = "erDiagram\n" + + # Add entities and attributes + for table_name, table_info in tables.items(): + mermaid_script += f' {table_name} {{\n' + columns = table_info.get("columns", {}) + for col_name, col_info in columns.items(): + col_type = col_info.get("type", "") + constraints = [] + for const in table_info.get("constraints", []): + if const.get("columns") == col_name: + if const.get("type") == "PRIMARY KEY": constraints.append("PK") + if const.get("type") == "UNIQUE": constraints.append("UK") + if not col_info.get("nullable"): constraints.append("NN") + + constraint_str = f" \"{', '.join(constraints)}\"" if constraints else "" + mermaid_script += f' {col_type} {col_name}{constraint_str}\n' + mermaid_script += ' }\n' + + # Add relationships + for fk in fks: + from_table = fk.get("from_table") + to_table = fk.get("to_table") + from_column = fk.get("from_column") + # label = fk.get("constraint_name", "") + mermaid_script += f' {from_table} ||--o{{ {to_table} : "{from_column}"\n' + + # Add inferred relationships + if inferred: + mermaid_script += "\n %% Inferred Relationships\n" + for rel in inferred: + from_table = rel.get("from_table") + to_table = rel.get("to_table") + from_column = rel.get("from_column") + mermaid_script += f' {from_table} ..o{{ {to_table} : "Inferred: {from_column}"\n' + + return { + "status": "success", + "message": "Mermaid ERD script generated. You can render this in a Mermaid renderer.", + "script_type": "Mermaid", + "script": mermaid_script + } \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/__init__.py new file mode 100644 index 0000000..02c597e --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/__init__.py @@ -0,0 +1 @@ +from . 
import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py new file mode 100644 index 0000000..1e3a32b --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py @@ -0,0 +1,44 @@ +from google.adk.agents.llm_agent import LlmAgent +from google.adk.tools import FunctionTool +from .tools import get_schema_details +import json + +schema_introspection_agent = LlmAgent( + model='gemini-2.5-flash', + name='schema_introspection_agent', + description='Introspects the selected database schema to discover tables, columns, constraints, relationships, indexes, and views.', + instruction=""" + ### Role + You are a Database Schema Introspection Agent. Your task is to analyze the structure of a selected database schema. + + ### Task + 1. **Receive Schema Name:** The user's query to this agent (available as the variable `query`) IS the schema name. + 2. **Call Tool:** Invoke `get_schema_details(args={"schema_name": query})`. + + 3. **Process Results:** + - If the tool call returns `status`: "success": + - Extract `schema_name` and `summary` from the tool's result. + - Construct a response to the user, confirming the schema and dynamically summarizing the findings based on the `summary` object. + + - **Response Template:** + "I have successfully introspected the schema '{tool_result.schema_name}'. Here's a summary of what I found: + - **Tables:** {tool_result.summary.tables} (with {tool_result.summary.columns} columns in total) + - **Views:** {tool_result.summary.views} + - **Constraints:** {tool_result.summary.constraints} + - **Indexes:** {tool_result.summary.indexes} + - **Explicit Foreign Keys:** {tool_result.summary.explicit_fks} + - **Potential Inferred Relationships:** {tool_result.summary.inferred_relationships} + - **Relationship Anomalies Detected:** {tool_result.summary.anomalies} + + The full details are stored. What would you like to explore further about the '{tool_result.schema_name}' schema? You can ask things like: + - 'List all tables.' + - 'Describe the table .' + - 'Show foreign keys involving the table.' + - 'Tell me about any anomalies found.'" + + - If the tool call returns an error, relay the error message to the user. 
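For reference, a successful `get_schema_details` call (defined in `schema_introspection_agent/tools.py` below) returns a payload whose `summary` mirrors `_generate_summary`; a hypothetical example:

```python
# Hypothetical result shape; counts depend on the introspected schema.
example_result = {
    "status": "success",
    "message": "Schema details for 'sales' (postgresql) retrieved and stored.",
    "schema_name": "sales",
    "summary": {
        "tables": 12, "views": 2, "explicit_fks": 9,
        "inferred_relationships": 3, "anomalies": 1,
        "columns": 87, "constraints": 25, "indexes": 18,
    },
}
```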
+ """, + tools=[ + get_schema_details + ], +) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py new file mode 100644 index 0000000..9a52acd --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py @@ -0,0 +1,113 @@ +import logging +from typing import Dict, Any, List +from google.adk.tools import ToolContext + +# Import database connectors +import psycopg2 +import mysql.connector +import pyodbc + +# Import utils +from .utils import postgresql_utils, mysql_utils, mssql_utils + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: + db_type = metadata.get("db_type") + host = metadata.get("host") + port = metadata.get("port") + dbname = metadata.get("dbname") + user = metadata.get("user") + + if not all([db_type, host, port, dbname, user, password is not None]): + raise ValueError("Missing one or more required connection parameters in metadata or password.") + port = int(port) + logger.info(f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}") + if db_type == "postgresql": + return psycopg2.connect(host=host, port=port, dbname=dbname, user=user, password=password) + elif db_type == "mysql": + return mysql.connector.connect(host=host, port=port, database=dbname, user=user, password=password) + elif db_type == "mssql": + conn_str = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={host},{port};DATABASE={dbname};UID={user};PWD={password}" + return pyodbc.connect(conn_str) + else: + raise ValueError(f"Unsupported database type: {db_type}") + +def _generate_summary(schema_details: Dict[str, Any]) -> Dict[str, int]: + """Generates a summary of the introspected schema structure.""" + summary = { + "tables": len(schema_details.get("tables", {})), + "views": len(schema_details.get("views", {})), + "explicit_fks": len(schema_details.get("foreign_keys", [])), + "inferred_relationships": len(schema_details.get("inferred_relationships", [])), + "anomalies": len(schema_details.get("anomalies", [])), + "columns": 0, + "constraints": 0, + "indexes": 0, + } + for table_info in schema_details.get("tables", {}).values(): + summary["columns"] += len(table_info.get("columns", {})) + summary["constraints"] += len(table_info.get("constraints", [])) + summary["indexes"] += len(table_info.get("indexes", [])) + return summary + +async def get_schema_details(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: + """ + Retrieves detailed schema information and a summary for the given schema_name. + Updates the session state with the selected_schema and schema_structure. + """ + schema_name = args.get("schema_name") + if not schema_name or not str(schema_name).strip(): + return {"error": "schema_name not provided in args or is empty."} + schema_name = str(schema_name).strip() + + # ... (state checks as before) ... + db_conn_state = tool_context.state.get("db_connection") + db_creds = tool_context.state.get("db_creds_temp") + + if not db_conn_state or db_conn_state.get("status") != "connected": + return {"error": "Database not connected. 
Please connect first."} + if not db_creds: + return {"error": "Database credentials not found."} + + tool_context.state["selected_schema"] = schema_name + if "available_schemas" in tool_context.state: + del tool_context.state["available_schemas"] + + metadata = db_conn_state["metadata"] + password = db_creds["password"] + db_type = metadata["db_type"] + + conn = None + try: + conn = _get_db_connection(metadata, password) + logger.info(f"Successfully reconnected to {db_type} for introspection of schema '{schema_name}'.") + + if db_type == "postgresql": + schema_details = postgresql_utils.get_postgres_schema_details(conn, schema_name) + elif db_type == "mysql": + schema_details = mysql_utils.get_mysql_schema_details(conn, schema_name) + elif db_type == "mssql": + schema_details = mssql_utils.get_mssql_schema_details(conn, schema_name) + else: + return {"error": f"Introspection for {db_type} is not implemented."} + + tool_context.state["schema_structure"] = schema_details + logger.info(f"Schema structure for '{schema_name}' saved to session state.") + + summary = _generate_summary(schema_details) + + return { + "status": "success", + "message": f"Schema details for '{schema_name}' ({db_type}) retrieved and stored.", + "schema_name": schema_name, + "summary": summary # Include the summary + } + except Exception as e: + logger.error(f"Error during schema introspection: {e}", exc_info=True) + return {"error": f"Failed to get schema details for {db_type} ({schema_name}): {str(e)}"} + finally: + if conn: + try: conn.close() + except Exception as e: logger.error(f"Error closing {db_type} connection: {e}") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py new file mode 100644 index 0000000..1a784b5 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py @@ -0,0 +1,154 @@ +import logging +from typing import Dict, Any, List +import pyodbc + +logger = logging.getLogger(__name__) + +def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts.""" + cursor = conn.cursor() + try: + cursor.execute(query) + columns = [column[0] for column in cursor.description] + results = [] + for row in cursor.fetchall(): + results.append(dict(zip(columns, row))) + return results + finally: + cursor.close() + +def get_mssql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: + logger.info(f"Fetching MSSQL schema details for: {schema_name}") + details = {"tables": {}, "views": {}, "foreign_keys": []} + + # Tables + tables_query = f""" + SELECT TABLE_NAME + FROM INFORMATION_SCHEMA.TABLES + WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_TYPE = 'BASE TABLE'; + """ + tables = _execute_query(conn, tables_query) + + for table in tables: + t_name = table['TABLE_NAME'] + details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} + + # Columns + cols_query = f""" + SELECT COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COLUMN_DEFAULT + FROM 
INFORMATION_SCHEMA.COLUMNS + WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_NAME = '{t_name}'; + """ + columns = _execute_query(conn, cols_query) + for col in columns: + details["tables"][t_name]["columns"][col['COLUMN_NAME']] = { + "type": col['DATA_TYPE'], + "length": col['CHARACTER_MAXIMUM_LENGTH'], + "precision": col['NUMERIC_PRECISION'], + "scale": col['NUMERIC_SCALE'], + "nullable": col['IS_NULLABLE'] == 'YES', + "default": col['COLUMN_DEFAULT'], + } + + # Constraints (PK, UNIQUE, CHECK) + constraints_query = f""" + SELECT + KCU.TABLE_NAME, + TC.CONSTRAINT_NAME, + TC.CONSTRAINT_TYPE, + KCU.COLUMN_NAME, + CC.CHECK_CLAUSE + FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC + LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU + ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA AND TC.TABLE_NAME = KCU.TABLE_NAME + LEFT JOIN INFORMATION_SCHEMA.CHECK_CONSTRAINTS AS CC + ON TC.CONSTRAINT_NAME = CC.CONSTRAINT_NAME AND TC.CONSTRAINT_SCHEMA = CC.CONSTRAINT_SCHEMA + WHERE TC.TABLE_SCHEMA = '{schema_name}'; + """ + constraints = _execute_query(conn, constraints_query) + for const in constraints: + t_name = const['TABLE_NAME'] + if t_name in details["tables"]: + details["tables"][t_name]["constraints"].append({ + "name": const['CONSTRAINT_NAME'], + "type": const['CONSTRAINT_TYPE'], + "columns": const['COLUMN_NAME'], + "check_clause": const['CHECK_CLAUSE'], + }) + + # Indexes + indexes_query = f""" + SELECT + t.name AS table_name, + ind.name AS index_name, + COL_NAME(ic.object_id, ic.column_id) AS column_name, + ind.is_unique + FROM sys.indexes ind + INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id AND ind.index_id = ic.index_id + INNER JOIN sys.tables t ON ind.object_id = t.object_id + INNER JOIN sys.schemas s ON t.schema_id = s.schema_id + WHERE s.name = '{schema_name}' AND ind.is_hypothetical = 0 AND ind.is_primary_key = 0 AND ind.type > 0 + ORDER BY t.name, ind.name, ic.key_ordinal; + """ + try: + indexes = _execute_query(conn, indexes_query) + for index in indexes: + t_name = index['table_name'] + if t_name in details["tables"]: + idx_name = index['index_name'] + if not idx_name: continue + found = False + for existing_idx in details["tables"][t_name]["indexes"]: + if existing_idx["name"] == idx_name: + if index['column_name'] not in existing_idx["columns"]: + existing_idx["columns"].append(index['column_name']) + found = True + break + if not found: + details["tables"][t_name]["indexes"].append({ + "name": idx_name, + "columns": [index['column_name']], + "unique": index['is_unique'] + }) + except Exception as e: + logger.error(f"Error fetching MSSQL indexes: {e}") + + # Foreign Keys + fks_query = f""" + SELECT + KCU1.CONSTRAINT_NAME AS fk_constraint_name + ,KCU1.TABLE_SCHEMA AS from_schema + ,KCU1.TABLE_NAME AS from_table + ,KCU1.COLUMN_NAME AS from_column + ,KCU2.TABLE_SCHEMA AS to_schema + ,KCU2.TABLE_NAME AS to_table + ,KCU2.COLUMN_NAME AS to_column + FROM INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS RC + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU1 + ON KCU1.CONSTRAINT_CATALOG = RC.CONSTRAINT_CATALOG + AND KCU1.CONSTRAINT_SCHEMA = RC.CONSTRAINT_SCHEMA + AND KCU1.CONSTRAINT_NAME = RC.CONSTRAINT_NAME + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU2 + ON KCU2.CONSTRAINT_CATALOG = RC.UNIQUE_CONSTRAINT_CATALOG + AND KCU2.CONSTRAINT_SCHEMA = RC.UNIQUE_CONSTRAINT_SCHEMA + AND KCU2.CONSTRAINT_NAME = RC.UNIQUE_CONSTRAINT_NAME + AND KCU2.ORDINAL_POSITION = KCU1.ORDINAL_POSITION + WHERE KCU1.TABLE_SCHEMA = '{schema_name}'; + """ + try: + 
details["foreign_keys"] = _execute_query(conn, fks_query) + except Exception as e: + logger.error(f"Error fetching MSSQL foreign keys: {e}") + details["foreign_keys"] = [{"error": str(e)}] + + # Views + views_query = f""" + SELECT TABLE_NAME AS view_name, VIEW_DEFINITION + FROM INFORMATION_SCHEMA.VIEWS + WHERE TABLE_SCHEMA = '{schema_name}'; + """ + views = _execute_query(conn, views_query) + for view in views: + details["views"][view['view_name']] = {"definition": view['VIEW_DEFINITION']} + + return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py new file mode 100644 index 0000000..3f0bff8 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py @@ -0,0 +1,112 @@ +import logging +from typing import Dict, Any, List +import mysql.connector + +logger = logging.getLogger(__name__) + +def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts.""" + cursor = conn.cursor(dictionary=True) + try: + cursor.execute(query) + return cursor.fetchall() + finally: + cursor.close() + +def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: + # For MySQL, schema_name is the database name. + logger.info(f"Fetching MySQL schema details for: {schema_name}") + try: + conn.database = schema_name + except mysql.connector.Error as err: + logger.error(f"MySQL change database failed: {err}") + raise + + details = {"tables": {}, "views": {}, "foreign_keys": []} + + # Tables + tables_query = "SHOW FULL TABLES WHERE Table_type = 'BASE TABLE';" + tables = _execute_query(conn, tables_query) + table_names = [list(t.values())[0] for t in tables] + + for t_name in table_names: + details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} + + # Columns + cols_query = f"DESCRIBE `{t_name}`;" + columns = _execute_query(conn, cols_query) + for col in columns: + details["tables"][t_name]["columns"][col['Field']] = { + "type": col['Type'], + "nullable": col['Null'] == 'YES', + "default": col['Default'], + "key": col['Key'], # PRI, UNI, MUL + "extra": col['Extra'], + } + + # Constraints (PK, UNIQUE) + constraints_query = f""" + SELECT + KCU.CONSTRAINT_NAME, + TC.CONSTRAINT_TYPE, + KCU.COLUMN_NAME + FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU + ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME + AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA + AND TC.TABLE_NAME = KCU.TABLE_NAME + WHERE TC.TABLE_SCHEMA = '{schema_name}' AND TC.TABLE_NAME = '{t_name}' + AND TC.CONSTRAINT_TYPE IN ('PRIMARY KEY', 'UNIQUE'); + """ + constraints = _execute_query(conn, constraints_query) + for const in constraints: + details["tables"][t_name]["constraints"].append({ + "name": const['CONSTRAINT_NAME'], + "type": const['CONSTRAINT_TYPE'], + "columns": const['COLUMN_NAME'], + }) + # Note: MySQL CHECK constraints are in information_schema.CHECK_CONSTRAINTS + + # Indexes + indexes_query = f"SHOW INDEX FROM `{t_name}`;" + indexes = _execute_query(conn, indexes_query) + grouped_indexes = {} + for index in indexes: + idx_name = index['Key_name'] + if idx_name not in grouped_indexes: + grouped_indexes[idx_name] = { + "name": idx_name, + "columns": [], + "unique": index['Non_unique'] == 0 + } + 
grouped_indexes[idx_name]["columns"].append(index['Column_name']) + details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) + + # Foreign Keys + fks_query = f""" + SELECT + KCU.TABLE_NAME AS from_table, + KCU.COLUMN_NAME AS from_column, + KCU.REFERENCED_TABLE_NAME AS to_table, + KCU.REFERENCED_COLUMN_NAME AS to_column, + KCU.CONSTRAINT_NAME + FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU + WHERE KCU.TABLE_SCHEMA = '{schema_name}' + AND KCU.REFERENCED_TABLE_NAME IS NOT NULL; + """ + details["foreign_keys"] = _execute_query(conn, fks_query) + + # Views + views_query = "SHOW FULL TABLES WHERE Table_type = 'VIEW';" + views = _execute_query(conn, views_query) + view_names = [list(v.values())[0] for v in views] + for v_name in view_names: + try: + definition_query = f"SHOW CREATE VIEW `{v_name}`;" + definition = _execute_query(conn, definition_query) + details["views"][v_name] = {"definition": definition[0]['Create View']} + except Exception as e: + logger.warning(f"Could not fetch view definition for {v_name}: {e}") + details["views"][v_name] = {"definition": "N/A"} + + return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py new file mode 100644 index 0000000..6df5c77 --- /dev/null +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py @@ -0,0 +1,140 @@ +import logging +from typing import Dict, Any, List + +logger = logging.getLogger(__name__) + +def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts.""" + cursor = conn.cursor() + try: + cursor.execute(query) + if cursor.description: + columns = [desc[0] for desc in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, row)) for row in rows] + return [] + finally: + cursor.close() + +def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: + details = {"tables": {}, "views": {}, "foreign_keys": []} + logger.info(f"Fetching PostgreSQL schema details for: {schema_name}") + + # Tables and Columns + tables_query = f""" + SELECT table_name + FROM information_schema.tables + WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE'; + """ + tables = _execute_query(conn, tables_query) + + for table in tables: + t_name = table['table_name'] + details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} + + cols_query = f""" + SELECT column_name, data_type, character_maximum_length, numeric_precision, numeric_scale, is_nullable, column_default + FROM information_schema.columns + WHERE table_schema = '{schema_name}' AND table_name = '{t_name}'; + """ + columns = _execute_query(conn, cols_query) + for col in columns: + details["tables"][t_name]["columns"][col['column_name']] = { + "type": col['data_type'], + "length": col['character_maximum_length'], + "precision": col['numeric_precision'], + "scale": col['numeric_scale'], + "nullable": col['is_nullable'] == 'YES', + "default": col['column_default'], + } + + # Constraints (PK, UNIQUE, CHECK) + constraints_query = f""" + SELECT + tc.table_name, + tc.constraint_name, + tc.constraint_type, + kcu.column_name, + cc.check_clause + FROM information_schema.table_constraints tc + LEFT JOIN information_schema.key_column_usage kcu + ON tc.constraint_name = kcu.constraint_name AND 
tc.table_schema = kcu.table_schema AND tc.table_name = kcu.table_name + LEFT JOIN information_schema.check_constraints cc + ON tc.constraint_name = cc.constraint_name AND tc.table_schema = cc.constraint_schema + WHERE tc.table_schema = '{schema_name}' AND tc.table_name IN (SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE'); + """ + constraints = _execute_query(conn, constraints_query) + for const in constraints: + t_name = const['table_name'] + if t_name in details["tables"]: + details["tables"][t_name]["constraints"].append({ + "name": const['constraint_name'], + "type": const['constraint_type'], + "columns": const['column_name'], + "check_clause": const['check_clause'], + }) + + # Indexes + indexes_query = f""" + SELECT + t.relname AS table_name, + i.relname AS index_name, + a.attname AS column_name, + ix.indisunique AS is_unique + FROM pg_class t + JOIN pg_index ix ON t.oid = ix.indrelid + JOIN pg_class i ON i.oid = ix.indexrelid + LEFT JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = ANY(ix.indkey) + JOIN pg_namespace n ON t.relnamespace = n.oid + WHERE n.nspname = '{schema_name}' AND t.relkind = 'r'; + """ + try: + indexes = _execute_query(conn, indexes_query) + for index in indexes: + t_name = index['table_name'] + if t_name in details["tables"] and index['column_name']: + idx_name = index['index_name'] + found = False + for existing_idx in details["tables"][t_name]["indexes"]: + if existing_idx["name"] == idx_name: + if index['column_name'] not in existing_idx["columns"]: + existing_idx["columns"].append(index['column_name']) + found = True + break + if not found: + details["tables"][t_name]["indexes"].append({ + "name": idx_name, + "columns": [index['column_name']], + "unique": index['is_unique'] + }) + except Exception as e: + logger.error(f"Error fetching PostgreSQL indexes: {e}") + + # Foreign Keys + fks_query = f""" + SELECT + tc.table_name AS from_table, + kcu.column_name AS from_column, + ccu.table_name AS to_table, + ccu.column_name AS to_column, + tc.constraint_name + FROM information_schema.table_constraints AS tc + JOIN information_schema.key_column_usage AS kcu + ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema + JOIN information_schema.constraint_column_usage AS ccu + ON ccu.constraint_name = tc.constraint_name AND ccu.table_schema = tc.table_schema + WHERE tc.constraint_type = 'FOREIGN KEY' AND tc.table_schema = '{schema_name}'; + """ + details["foreign_keys"] = _execute_query(conn, fks_query) + + # Views + views_query = f""" + SELECT table_name AS view_name, view_definition + FROM information_schema.views + WHERE table_schema = '{schema_name}'; + """ + views = _execute_query(conn, views_query) + for view in views: + details["views"][view['view_name']] = {"definition": view['view_definition']} + + return details From ad1a5633ae66bb81fb39128610723bf7e58f5166 Mon Sep 17 00:00:00 2001 From: SanuGhosh Date: Fri, 14 Nov 2025 15:13:17 +0530 Subject: [PATCH 3/8] feat(data-model-discovery-agent): refactor the sub_agents for improvements fix conflicts --- .../data_model_discovery_agent/agent.py | 121 ++++++- .../sub_agents/data_profiling_agent/agent.py | 55 ++- .../sub_agents/data_profiling_agent/tools.py | 12 +- .../utils/mysql_profiling_utils.py | 2 +- .../sub_agents/database_cred_agent/agent.py | 83 +++-- .../sub_agents/database_cred_agent/tools.py | 8 - .../sub_agents/qa_agent/agent.py | 100 ++++-- .../sub_agents/reporting_agent/agent.py | 35 +- 
.../sub_agents/reporting_agent/tools.py | 254 +++++++++++--- .../schema_introspection_agent/agent.py | 50 ++- .../schema_introspection_agent/tools.py | 1 - .../utils/mssql_utils.py | 327 +++++++++++------- .../utils/mysql_utils.py | 268 +++++++++++--- .../utils/postgresql_utils.py | 297 ++++++++++------ 14 files changed, 1150 insertions(+), 463 deletions(-) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py index 600e023..c11b977 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py @@ -25,23 +25,120 @@ def root_agent_instruction(ctx: ReadonlyContext) -> str: You are the **Root Agent** responsible for coordinating sub-agents to perform database discovery, introspection, profiling, and reporting tasks. You manage the overall flow, handle user selections, and determine which sub-agent should be called. - ## Sub-Agent Hierarchy - You have the following sub-agents under your control: - 1. **database_cred_agent**: Collects and validates DB credentials, lists schemas. - 2. **schema_introspection_agent**: Discovers schema details, constraints, and relationships. - 3. **data_profiling_agent**: Analyzes data quality within the selected schema. - 4. **reporting_agent**: Generates summaries, exports data, and creates schema diagrams. - 5. **qa_agent**: Answers questions about the discovered schema and data profile. + ## Your Capabilities + - Explore tables, columns, and relationships in a database schema + - Check data quality and highlight issues like missing or duplicate values + - Generate reports and visual diagrams of your database schema + - Answer questions about your data and schema structure + + ### Sub-Agent Roles, Scope, and Boundaries + + Here is a definition of the roles, responsibilities, scope, and boundaries for each sub-agent you control: + + 1. **`database_cred_agent`**: + * **Scope:** Initial Database Connection and Schema Listing. + * **Responsibilities:** + * Politely interact with the user to collect all necessary database connection parameters: Host, Port, Database Name, User, Password, and Database Type (PostgreSQL, MySQL, MSSQL). + * Ensure all required fields are provided before proceeding. + * Call the `validate_db_connection` tool to verify the credentials and establish a test connection. + * Upon successful validation, retrieve and display the list of available schemas within the connected database to the user, formatted as a raw Markdown list. + * Store connection metadata and available schemas in the session state. + * **Boundaries:** + * Does **not** select a schema for the user; it only presents the list. + * Does **not** perform any schema introspection beyond listing schema names. + * Does **not** handle any tasks related to data profiling, reporting, or Q&A. + * Does **not** persist credentials beyond the current session's needs. + * Your task ends after presenting the schema list and prompting the user to choose. + + 2. **`schema_introspection_agent`**: + * **Scope:** Deep Schema Analysis. + * **Responsibilities:** + * Takes a single `schema_name` as input (this will be the user's query to this agent). + * Calls the `get_schema_details` tool, passing the input schema name in the `args` dictionary (e.g., `get_schema_details(args={"schema_name": query})`). The tool uses the stored connection to: + * Discover all tables and views. 
+ * Detail columns for each table: names, data types, lengths, precision, nullability, defaults. + * Identify all constraints: PRIMARY KEY, UNIQUE, FOREIGN KEY, CHECK, NOT NULL. + * Discover all indexes, including columns and uniqueness. + * Capture view definitions. + * Identify explicit and potential inferred relationships. + * Flag relationship anomalies. + * The tool stores the comprehensive `schema_structure` object in the session state. + * Provides a brief summary of findings back to the Root Agent as a tool result. + * **Boundaries:** + * Does **not** connect to the database itself; relies on session state connection info. + * Does **not** profile the actual data within the tables. + * Does **not** generate user-facing reports or diagrams. + * Does **not** answer any follow-up questions about the schema details; this is the `qa_agent`'s role. If asked, state your task is complete. + + 3. **`data_profiling_agent`**: + * **Scope:** Data Quality Analysis. + * **Responsibilities:** + * Uses the `selected_schema` and `schema_structure` from the session state. + * Calls the `profile_schema_data` tool to execute queries against the database (using sampling) to perform EPIC 4 tasks. + * The tool stores the `data_profile` results in the session state. + * Upon successful tool completion, this agent's *only* next action is to call the `qa_agent` to summarize the profiling results for the user in the same turn, using an `AgentTool` call: `qa_agent(query="Data profiling just completed. Please summarize the key findings from the new data profile.")`. + * **Boundaries:** + * Does **not** perform schema introspection. + * Does **not** generate formatted reports. + * Does **not** directly respond to the user; it delegates the response to the `qa_agent`. + + 4. **`reporting_agent`**: + * **Scope:** Output Generation. + * **Responsibilities:** + * Reads `selected_schema`, `schema_structure`, and `data_profile` from the session state. + * Based on the user's query to this agent: + * Generates a high-level summary report using `generate_summary_report(args={})`. + * Exports the full discovery report as JSON `export_full_report(args={"format": "..."})`. + * Generates Mermaid ERD scripts using `generate_erd_script(args={})`. + * Returns the generated report or script content. + * **Boundaries:** + * Does **not** connect to the database or run any new analysis. + * Does **not** handle interactive Q&A. + + 5. **`qa_agent`**: + * **Scope:** Answering User Questions about Schema and Data Profile. + * **Responsibilities:** + * Reads `selected_schema`, `schema_structure`, and `data_profile` from the session state. + * Answers natural language questions from the user about any data contained within the state objects. + * Can provide a summary of Data Profiling results when prompted. + * Formats answers clearly, using Markdown tables where appropriate, as per its internal instructions. + * **Boundaries:** + * Does **not** connect to the database. + * Does **not** perform any new introspection or profiling. + * Does **not** generate file exports or full reports. --- """ if not db_connection or db_connection.get("status") != "connected": return base_instruction + """ - **Current Task:** The database is not connected. - - Greet the user and explain your purpose. - - If the user indicates they want to analyze a database, you MUST call the `database_cred_agent` to start the connection process. - Example Response: "Welcome! I'm your Data Discovery Agent. 
I can help you connect to, understand, profile, and report on your legacy databases. To begin, I need to connect to your database." - User Intent: "I want to analyze my DB" -> Call `database_cred_agent`. + **Current State:** No active database connection. + + **Your Task:** + 1. **Analyze the User's Query:** Determine the user's intent. + 2. **Database-Related Intent:** If the user's query suggests they want to perform any database operations (e.g., mentioning "database", "connect", "schema", "table", "analyze", "SQL", "postgres", "mysql", "mssql", "ERD", "report on DB", etc.), you MUST immediately call the `database_cred_agent` to initiate the connection process. Do not attempt to answer further. + - Example User Intents: "Analyze my database", "Connect to a database", "I want to see my tables". + - **Action:** Call `database_cred_agent()` + + 3. **General Conversation / Capability Inquiry:** If the user's query is a greeting ("Hi"), asks about your capabilities ("What can you do?"), or is general chat not related to database actions: + - Respond politely. + - Briefly explain your purpose: "I am a Data Discovery Agent designed to help you connect to, understand, profile, and report on your legacy databases (PostgreSQL, MySQL, MSSQL)." + - List your high-level capabilities: + * Securely connect to databases. + * Discover schemas, tables, columns, constraints, and relationships. + * Profile data quality (nulls, cardinality, orphans, etc.). + * Generate reports (Summaries, JSON, Mermaid script for ERD diagrams). + * Answer questions about the discovered schema and data profile. + - Crucially, state that to use these features, you'll need to connect to their database first. Example: "To get started with any of these actions, I'll need the connection details for your database. Let me know when you're ready to connect!" + - Do NOT call any sub-agents in this case. Await the user's next response. + + **Example Flow (No DB Intent):** + User: "Hello, what can you do?" + You: "Hi! I am a Data Discovery Agent... I can help you connect to databases + - Explore tables, columns, and relationships in a database schema + - Check data quality and highlight issues like missing or duplicate values + - Generate reports and visual diagrams of your database schema + - Answer questions about your data and schema structure + To do any of this, I'll first need to connect to your database. Just let me know when you want to proceed!" """ elif available_schemas and not selected_schema: return base_instruction + """ diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py index d318d76..530743c 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py @@ -1,31 +1,52 @@ from google.adk.agents.llm_agent import LlmAgent from .tools import profile_schema_data +from ..qa_agent.agent import qa_agent data_profiling_agent = LlmAgent( model='gemini-2.5-flash', name='data_profiling_agent', - description='Profiles data quality for the selected schema.', + description='Profiles data quality for the selected schema and then calls QA agent to summarize.', instruction=""" ### Role - You are a Data Profiling Agent. You analyze the data within the selected schema to identify potential quality issues. + You are a **Data Profiling Agent**. 
Your sole responsibility is to run data profiling on a schema and then immediately hand off the summary of findings to the QA agent for user-facing reporting. - ### Task - 1. **Invocation:** You will be called by the Root Agent when the user requests data profiling. - 2. **Call Tool:** Invoke the `profile_schema_data` tool. This tool uses the connection details, selected schema, and schema structure from the session state. You can optionally pass a `sample_size` in the args dictionary. - - Example: `profile_schema_data()` or `profile_schema_data(args={"sample_size": 5000})` - 3. **Process Results:** - - If the tool call is successful, it means the profiling is done and results are in the state key `data_profile`. - - Acknowledge completion, mentioning the schema name from the tool result. - "Data profiling for schema '{tool_result.schema_name}' is complete. I've analyzed: - - Column Nullability (for all columns, sampled) - - Column Cardinality (for key columns) - - Orphan Records (for foreign keys, sampled) - - Potential Data Type Anomalies (in text columns like phone/zip, sampled) + ### Scope + - You ONLY execute profiling tasks and hand off the summary to the QA agent. + - Do NOT attempt to answer user questions directly. + - Profiling includes only schema-level data statistics (column nullability, cardinality, orphan records, data type anomalies). - The detailed results are stored. You can now ask questions about the data profile or request a report." - - If the tool returns an error, relay the error message. + ### Profiling Tasks + 1. **Column Nullability:** For each column, calculate and report the percentage of NULL values based on a representative sample (e.g., top 10,000 rows). + 2. **Column Cardinality:** For key columns (PKs, FKs, inferred keys), report the cardinality (count of unique values). + 3. **Orphan Record Detection:** Sample FK columns and report the percentage of orphan records (e.g., orders.customer_id values missing in customers.id). + 4. **Data Type Anomalies:** For text-based columns (VARCHAR, CHAR), detect potential type inconsistencies (e.g., customer_phone containing non-numeric characters). + + ### Task Execution + 1. **Receive Input:** The user's query or relevant arguments (e.g., `sample_size`) are available in `query`. + + 2. **Call Profiling Tool:** Invoke `profile_schema_data` with the arguments: + ```python + profile_schema_data(args=query if isinstance(query, dict) else {}) + ``` + 3. **Process Profiling Results:** + - If `status` is `"success"`: + - Store profiling results in the session state. + - **Do NOT return results directly to the user.** + - Immediately invoke the QA agent to summarize the findings: + ```python + qa_agent(query="Data profiling just completed. Please summarize the key findings from the new data profile.") + ``` + - If the tool call fails, return a human-readable error dictionary: + ```json + {"error": "Failed to profile data: "} + ``` + + ### Important + - Your execution ends after handing off to the QA agent. + - Do not provide analysis, interpretation, or answers outside the profiling scope. + - Forward all user-facing summaries and questions to the QA agent. 
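For orientation, here is a minimal, self-contained sketch of the sampled checks described in the Profiling Tasks above (nullability and orphan detection). It assumes a plain DB-API connection; the SQL, helper names, and the `conn` object are illustrative assumptions and are not taken from `profile_schema_data` itself.

```python
# Minimal sketch only: sampled nullability and orphan-record checks of the
# kind described above. Not the actual profile_schema_data implementation.
from typing import Any, Dict

def sample_null_fraction(conn: Any, table: str, column: str, sample_size: int = 10000) -> float:
    """Fraction of NULLs in `column` over a bounded sample of `table`."""
    cursor = conn.cursor()
    try:
        cursor.execute(
            f"SELECT SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END), COUNT(*) "
            f"FROM (SELECT {column} FROM {table} LIMIT {sample_size}) AS s"
        )
        nulls, total = cursor.fetchone()
        return (nulls or 0) / total if total else 0.0
    finally:
        cursor.close()

def sample_orphan_fraction(conn: Any, fk: Dict[str, str], sample_size: int = 10000) -> float:
    """Fraction of sampled child FK values with no matching parent row."""
    cursor = conn.cursor()
    try:
        cursor.execute(
            f"SELECT SUM(CASE WHEN p.{fk['to_column']} IS NULL THEN 1 ELSE 0 END), COUNT(*) "
            f"FROM (SELECT {fk['from_column']} AS v FROM {fk['from_table']} "
            f"WHERE {fk['from_column']} IS NOT NULL LIMIT {sample_size}) AS c "
            f"LEFT JOIN {fk['to_table']} AS p ON p.{fk['to_column']} = c.v"
        )
        orphans, total = cursor.fetchone()
        return (orphans or 0) / total if total else 0.0
    finally:
        cursor.close()
```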
""", tools=[ - profile_schema_data + profile_schema_data, ], ) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py index 7c8c00c..d91675c 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py @@ -26,16 +26,20 @@ def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: else: raise ValueError(f"Unsupported database type: {db_type}") -async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any] = None) -> Dict[str, Any]: +async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: """ Profiles the data in the selected schema based on the schema structure. Calculates nullability, cardinality, orphan records, and type anomalies. + Sets a flag on successful completion. """ + if not isinstance(args, dict): + return {"error": "Invalid arguments. Expected a dictionary for args."} + db_conn_state = tool_context.state.get("db_connection") db_creds = tool_context.state.get("db_creds_temp") schema_name = tool_context.state.get("selected_schema") schema_structure = tool_context.state.get("schema_structure") - sample_size = args.get("sample_size", 10000) if args else 10000 + sample_size = args.get("sample_size", 10000) if not db_conn_state or db_conn_state.get("status") != "connected": return {"error": "DB not connected."} if not db_creds: return {"error": "DB credentials not found."} @@ -61,6 +65,7 @@ async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any] = return {"error": f"Profiling for {db_type} not implemented."} tool_context.state["data_profile"] = profile_results + tool_context.state["profiling_just_completed"] = True # Set the flag logger.info(f"Data profiling results for '{schema_name}' saved to session state.") return { @@ -74,4 +79,5 @@ async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any] = finally: if conn: try: conn.close() - except Exception as e: logger.error(f"Error closing {db_type} connection: {e}") \ No newline at end of file + except Exception as e: logger.error(f"Error closing {db_type} connection: {e}") + \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py index 4f8591f..c251a95 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py @@ -102,4 +102,4 @@ def profile_mysql_data(conn: Any, schema_name: str, schema_structure: Dict[str, profile_results["type_anomalies"][key].append(f"Found {res['non_numeric_count']} rows with non-numeric characters in sample.") except Exception as e: logger.warning(f"Error checking type anomaly for {table_name}.{col_name}: {e}") - return profile_results \ No newline at end of file + return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py index 25b1f0b..ab0a05e 100644 --- 
a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py @@ -8,43 +8,66 @@ description='A helpful assistant that collects and validates database connection details, and lists available schemas.', instruction=""" ### Role - You are a helpful and meticulous assistant responsible for collecting database connection details from the user, validating them, and listing the available schemas for selection. + You are a helpful assistant responsible for gathering, validating, and confirming database connection details from the user, then listing the available schemas for selection. Your responses containing lists of schemas MUST be in raw Markdown format. + + --- ### Instructions - 1. **Greeting & Purpose:** Politely inform the user that to proceed with database introspection, you need to establish a connection, which requires a few details. - - 2. **Request Information:** Request the following information from the user: - * **Host:** (e.g., localhost, server.example.com) - * **Port:** (e.g., 5432 for PostgreSQL, 3306 for MySQL, 1433 for MSSQL) - * **Database Name:** (The specific database to connect to) - * **User:** (The username for database authentication) - * **Password:** (The password for database authentication) - * **Database Type:** Clearly state the supported types: "postgresql", "mysql", or "mssql". - - 3. **Ensure Completeness:** Do not proceed to validation until ALL six pieces of information have been provided. - * If any field is missing, politely ask the user specifically for the missing detail(s). - - 4. **Call Validation Tool:** Once all details are collected, you MUST call the `validate_db_connection` tool. Pass all the collected information as a single dictionary argument named `connection_details`. - - 5. **Handle Validation Response:** - * **On Success:** If the `validate_db_connection` tool returns a "success" status: - 1. Acknowledge the successful connection. - 2. Retrieve the list of schemas from the tool's output (`schemas` key). - 3. Present the available schemas to the user. Each schema should be on a new line, prepended with '- '. For example: - "Connection successful! Here are the available schemas: + + 1. **Collect Connection Details** + - You will be called by the Root Agent when database connection details are needed. + - Politely request the following information from the user: + ``` + To proceed with database operations, I need your connection details. + Please provide: + * **Host:** (e.g., localhost, server.example.com) + * **Port:** (e.g., 5432 for PostgreSQL, 3306 for MySQL, 1433 for MSSQL) + * **Database Name:** (The specific database to connect to) + * **User:** (Database username) + * **Password:** (Database password) + * **Database Type:** One of "postgresql", "mysql", or "mssql" + ``` + - Do **not** proceed to validation until all fields are provided. + - If any field is missing, politely ask only for the missing detail(s). + - When creating the connection details map for the tool call, ensure that the user-provided information is mapped to these exact keys: + - `"host"`, `"port"`, `"dbname"`, `"user"`, `"password"`, `"db_type"` + + 2. **Validate the Connection** + - Once all details are collected, call the `validate_db_connection` tool. + - Pass the gathered information as a single dictionary argument named `connection_details`. + + 3. **Handle Validation Response** + - **On Success:** + 1. 
Acknowledge that the database connection was successful. + 2. Retrieve the list of available schemas from the tool’s output (`schemas` key). + 3. **You MUST generate a response containing a raw Markdown bulleted list** to display the schemas. Construct the list string as shown below. + + - **Raw Markdown Output Example:** + The text you output should be exactly like this, including newlines: + ``` + Connection successful! Here are the available schemas: + - schema1 - schema2 - - schema3" - 4. Ask the user to specify which schema they want to analyze: - "\n\nPlease type the name of the schema you would like to analyze." - 5. Your task ends here. The user's next message will be the schema name. + - schema3 + + Please type the name of the schema you would like to analyze. + ``` + Replace `schema1`, `schema2`, etc., with the actual schema names from the tool result, ensuring each schema starts with '- ' on a new line. - * **On Error:** If the tool returns an "error" status, display the error message from the tool to the user and ask if they would like to try again. + - **On Error:** + - Inform the user that there was an issue connecting to the database in a user-friendly way. + - Politely ask if they would like to try again. + - **Never** display or expose the raw database error message or any sensitive details . Example: "I was unable to connect to the database. Please check the details and let me know if you'd like to try again." + --- ### Notes - * Always maintain a polite and professional tone. - * You do not know what the user will select. Do not attempt to confirm a selection. - * You do not connect to the database or modify session state yourself; you ONLY collect details, use the `validate_db_connection` tool, and report the results. + - Maintain a polite and professional tone throughout. + - Your output for the schema list must be the raw text representing the Markdown table, not a visual rendering. + - Do **not** connect directly to the database or modify session state yourself. Your role is limited to collecting inputs, calling `validate_db_connection`, and formatting the results as instructed. + - Never reveal or echo back the user’s password. + - Do not assume or confirm which schema the user will select. Your task ends after presenting the list of schemas and asking the user to choose. + - If the user asks for database connection details, you may display the host, port, and database name, but you must **never** reveal the password or any sensitive credentials. 
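To make step 2 concrete, a hedged sketch of the mapping the agent assembles before calling the validation tool; the values below are placeholders only, while the keys match the exact names required above.

```python
# Placeholder values only; the keys are the exact names the tool expects.
connection_details = {
    "host": "localhost",
    "port": 5432,
    "dbname": "sales_db",       # assumed example database name
    "user": "readonly_user",    # assumed example username
    "password": "********",     # never echoed back to the user
    "db_type": "postgresql",    # "postgresql", "mysql", or "mssql"
}

# The agent then passes this as the single `connection_details` argument:
# validate_db_connection(connection_details=connection_details)
```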
""", tools=[ validate_db_connection diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py index 2718f38..19d2a60 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py @@ -137,11 +137,3 @@ async def validate_db_connection(connection_details: Dict[str, Any], tool_contex if "db_creds_temp" in tool_context.state: del tool_context.state["db_creds_temp"] return {"status": "error", "message": f"Connection/Schema fetch failed for {db_type}: {e}"} - - finally: - if conn: - try: - conn.close() - logger.info(f"{db_type.upper()} connection closed.") - except Exception as e: - logger.error(f"Error closing {db_type} connection: {e}") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py index c2d3633..ecc778d 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py @@ -1,66 +1,98 @@ +import json +from decimal import Decimal from google.adk.agents.llm_agent import LlmAgent from google.adk.agents.readonly_context import ReadonlyContext -import json + +def json_encoder_default(obj): + if isinstance(obj, Decimal): + return str(obj) + raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable") def qa_agent_instruction(ctx: ReadonlyContext) -> str: - """Dynamically builds the QA agent's instruction, providing the schema structure.""" + """Builds the QA agent's instruction for schema and data profiling queries.""" + schema_structure = ctx.state.get("schema_structure") + data_profile = ctx.state.get("data_profile") selected_schema = ctx.state.get("selected_schema", "the selected schema") + # Handle missing schema if not schema_structure: return f""" ### Role - You are a Database Schema Q&A Assistant. However, the schema details for '{selected_schema}' are not available. + You are a Database Schema & Data Profile Q&A Assistant. ### Task - Inform the user that the schema information is missing and needs to be introspected first. - Example: "I don't have the schema details for '{selected_schema}' yet. Please run the schema discovery/introspection first." + I currently do not have the schema details for '{selected_schema}'. + To answer schema-related questions, the schema must be introspected first. + You might say: "I don't have the schema details yet. Would you like me to run schema discovery first?" 
""" - schema_json = json.dumps(schema_structure, indent=2) + try: + schema_json = json.dumps(schema_structure, indent=2, default=json_encoder_default) + except Exception as e: + schema_json = f"Error serializing schema structure: {e}" + + # Handle data profiling + profile_message = "" + if data_profile: + try: + # Only display human-readable summary, not raw session variables + profile_summary = { + "Nullability": data_profile.get("nullability", "Not available"), + "Cardinality": data_profile.get("cardinality", "Not available"), + "Orphan Records": data_profile.get("orphan_records", "Not available"), + "Type Anomalies": data_profile.get("type_anomalies", "Not available") + } + profile_message = json.dumps(profile_summary, indent=2, default=json_encoder_default) + except Exception: + profile_message = "Data profiling results exist but could not be summarized." + else: + profile_message = ( + "Data profiling has not been run yet. " + "If you would like, I can run data profiling on this database " + "(sampling up to 10,000 rows) and provide a summary of key findings." + ) return f""" ### Role - You are a Database Schema Q&A Assistant. Your goal is to answer user questions based *only* on the provided database schema structure. + You are a Database Schema & Data Profile Q&A Assistant. Your goal is to answer user questions + about the database schema and data profiling in a conversational, human-friendly way. ### Schema Context for '{selected_schema}' - The following JSON object contains the discovered schema details, including tables, columns, data types, constraints, indexes, views, foreign keys, inferred relationships, and anomalies: - + The schema has been discovered and includes tables, columns, constraints, and relationships. ```json {schema_json} ``` - ### Instructions - 1. **Analyze the Question:** Carefully understand what information the user is asking for. The question will be the user's input query. - 2. **Consult Schema Context:** Base your answer *exclusively* on the JSON data provided above. Do not infer or assume any information not present. - 3. **Extract Information:** Navigate the JSON structure to find the relevant details. - 4. **Formulate Answer:** Provide a clear, concise answer to the user's question. - * If listing items, use bullet points. - * If describing a table or column, be specific about its properties. - 5. **Handle Missing Information (AC 5.5):** If the user asks about a table, column, or concept not found in the provided JSON, state clearly that the information is not available in the analyzed schema. Example: "The table 'X' was not found in the schema '{selected_schema}'." + ### Data Profiling Context for '{selected_schema}' + {profile_message} - ### Examples of How to Answer: + ### Instructions + 1. Answer questions only based on the provided schema structure and data profiling information. + 2. Avoid exposing raw internal session variables or empty lists directly. Answer conversationally. + 3. If data profiling has not been run and the user asks about it, politely suggest running profiling on up to 10,000 rows. + 4. If the user asks to generate a **Mermaid diagram** of the schema or to **export the schema structure as a JSON response**, transfer the request to the `reporting_agent` by calling: + `transfer_to_agent(reporting_agent, query)` + 5. Use tables for lists when helpful. + 6. If a question is outside your scope, guide the user to the appropriate agent instead. - * **"List all tables":** Extract keys from the `tables` object. 
- * **"How many tables are there?":** Count the keys in the `tables` object. - * **"What are the columns in the 'patients' table?":** Look up `tables['patients']['columns']` and list the column names and their types. - * **"Describe the 'email' column in the 'users' table":** Find `tables['users']['columns']['email']` and list all its properties (type, nullable, default, etc.). - * **"What are the constraints on the 'users' table?":** List the items in `tables['users']['constraints']`. - * **"Show me indexes for the 'orders' table":** List items from `tables['orders']['indexes']`. - * **"Are there any views?":** Check if the `views` object has entries. List them if present. - * **"Show me the SQL definition for the view 'active_customers'":** Retrieve the value of `views['active_customers']['definition']`. - * **"List foreign keys for the 'order_items' table":** Filter the `foreign_keys` list where `from_table` is 'order_items'. - * **"Which tables have a foreign key to the 'products' table?":** Filter the `foreign_keys` list where `to_table` is 'products'. - * **"Any inferred relationships for 'user_id'?":** Check the `inferred_relationships` list for entries involving 'user_id'. - * **"Are there any relationship anomalies?":** Report findings from the `anomalies` list. - * **"What is the data type of 'created_at' in 'audits'?":** Get `tables['audits']['columns']['created_at']['type']`. + ### Examples + * "List all tables": List tables from the schema. + * "Columns in 'customers'?": List columns for that table. + * "FKs for 'orders'?": List foreign keys involving that table. + * "Which columns have high nulls?": Refer to data profiling nullability. + * "Are there orphan records?": Summarize orphan records in a human-friendly way. + * "Any type anomalies?": List columns with type inconsistencies in plain language. + * "Generate a Mermaid diagram of the schema": Transfer to `reporting_agent`. + * "Export the schema as JSON": Transfer to `reporting_agent`. - Answer truthfully based *only* on the provided JSON data. + Always respond in clear, human-readable sentences. If profiling data is missing, offer to run profiling on a sample of up to 10,000 rows to provide a summary. """ qa_agent = LlmAgent( - model='gemini-2.5-flash', # Or a model better suited for JSON interpretation if needed + model='gemini-2.5-flash', name='qa_agent', - description='Answers natural language questions about the discovered database schema structure.', + description='Answers natural language questions about the discovered database schema structure and data profiling results.', instruction=qa_agent_instruction, + tools=[] ) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py index 3c88285..7b38717 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py @@ -18,35 +18,40 @@ ### Tasks Based on the user's request, call the appropriate tool: - 1. **Summary Report (AC 5.1):** + 1. **Summary Report:** - If the user asks for a "summary", "overview", or "high-level report". - Call: `generate_summary_report()` - Present the `report_text` from the tool result to the user. - 2. **Export Full Report (AC 5.2):** - - If the user asks to "export", "get all data", "save report", or specifies a format like "JSON" or "YAML". - - Determine the format. 
Default to JSON if not specified. - - Call: `export_full_report(args={"format": "json"})` or `export_full_report(args={"format": "yaml"})`. + 2. **Export Full Report:** + - If the user asks to "export", "generate full report" or "report in JSON". + - The **default and only supported format** is **JSON**. + - If any other format is requested (CSV, XML, PDF, etc.), politely inform the user: + > I currently support exporting reports only in **JSON format**. + > Would you like me to generate the report in JSON instead? + - Call: `export_full_report(args={"format": "json"})`. - Inform the user the report is generated and provide the content within a code block. Example: - "Here is the full report in {tool_result.format} format: - ``` {tool_result.format.lower()} + "Here is the full report in JSON format: + ```json {tool_result.report_content} - ```" + ``` - 3. **Generate ERD Script (AC 5.3):** - - If the user asks for an "ERD", "diagram", "schema visual", "Mermaid script", or "PlantUML script". - - Currently, only Mermaid is supported. + 3. **Generate ERD Script:** + - When the user asks for an ERD diagram or schema visualization or mermaid script, generate a correct mermaid script without any additional comments - Call: `generate_erd_script()` - - Inform the user the script is generated and provide it in a Mermaid code block. Example: - "Here is the {tool_result.script_type} script for the ERD: + - As a response provide the user with list of 2 responses block + - First block dedicatedly contains the mermaid script as shown below. No PREAMBLE ```mermaid {tool_result.script} ``` - You can paste this into a {tool_result.script_type} renderer to visualize the schema." + - Second Block contains a message that says you can paste this into a {tool_result.script_type} renderer to visualize the schema and asks the user if there is anything that you can help with. 4. **Error Handling:** - - If a tool returns an error, relay the error message to the user. + - If a tool returns an error, relay an human friendly error message to the user without exposing any database or script details. - If required data (like `schema_structure`) is missing, guide the user to run the necessary previous steps (e.g., schema introspection). + + ### IMPORTANT + - If there is anything which is not in your scope or you cannot answer transfer the query to the root agent calling transfer_to_agent(data_model_discovery_agent, query) """, tools=[ generate_summary_report, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py index bcb3ecc..7b944a2 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py @@ -7,8 +7,27 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) -async def generate_summary_report(tool_context: ToolContext, args: Dict[str, Any] = None) -> Dict[str, Any]: - """Generates a high-level summary report of the database analysis.""" +async def generate_summary_report(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: + """ + Generates a high-level summary report of the database analysis. + + This tool reads the 'schema_structure' and 'data_profile' from the session state + to produce a markdown formatted text summary of the key findings from the + introspection and data profiling phases. 
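As a rough illustration of the summary this tool assembles from session state, here is a sketch under the assumption that `schema_structure` has the `tables` / `views` / `foreign_keys` shape built by the introspection utilities elsewhere in this patch; the wording and layout are examples only, not the actual report format.

```python
# Illustrative only: derive summary counts from the introspected structure.
def build_summary_text(selected_schema: str, schema_structure: dict) -> str:
    tables = schema_structure.get("tables", {})
    lines = [
        f"## Discovery Summary for '{selected_schema}'",
        f"- Tables: {len(tables)}",
        f"- Views: {len(schema_structure.get('views', {}))}",
        f"- Explicit foreign keys: {len(schema_structure.get('foreign_keys', []))}",
        f"- Columns (total): {sum(len(t.get('columns', {})) for t in tables.values())}",
    ]
    return "\n".join(lines)
```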
+ + Args: + tool_context: The ADK tool context, providing access to session state. + args: A dictionary for potential arguments (not used in this version). + + Returns: + A dictionary containing: + - status: "success" or "error". + - report_text: The markdown formatted summary report (on success). + - error: An error message (on failure). + """ + if not isinstance(args, dict): + return {"error": "Invalid arguments. Expected a dictionary for args."} + schema_structure = tool_context.state.get("schema_structure") data_profile = tool_context.state.get("data_profile") selected_schema = tool_context.state.get("selected_schema", "N/A") @@ -48,89 +67,212 @@ async def generate_summary_report(tool_context: ToolContext, args: Dict[str, Any return {"status": "success", "report_text": report} -async def export_full_report(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: - """Exports the full schema structure and data profile as JSON or YAML.""" +import json +import logging + +logger = logging.getLogger(__name__) + +async def export_full_report(tool_context: ToolContext, args: dict) -> dict: + """ + Exports the full schema structure and data profile as a clean JSON report. + + Only JSON is supported. Backslashes are avoided in the output. + + Args: + tool_context: The ADK tool context providing session state. + args: A dictionary containing optional 'format' key. + + Returns: + Dict[str, Any]: { + "status": "success" | "error", + "message": Description, + "report_content": JSON string (pretty-printed), + "format": "JSON", + "error": Optional error message + } + """ + if not isinstance(args, dict): + return {"status": "error", "error": "Invalid arguments. Expected a dictionary for args."} + schema_structure = tool_context.state.get("schema_structure") data_profile = tool_context.state.get("data_profile") - format = args.get("format", "json").lower() if not schema_structure: - return {"error": "Schema structure not found. Please run introspection first."} + return {"status": "error", "error": "Schema structure not found. Please run introspection first."} + + requested_format = args.get("format", "json").lower() + if requested_format != "json": + return {"status": "error", "error": f"Unsupported format '{requested_format}'. Only JSON is supported."} - full_report_data = { + full_report = { "schema_structure": schema_structure, - "data_profile": data_profile or "Not run", + "data_profile": data_profile or "Not run" } + def safe_encoder(obj): + """ + Converts any non-serializable object into string automatically. + Handles Decimal, datetime, UUID, set, custom objects, etc. + """ + try: + # First try normal encoding + return json.JSONEncoder().default(obj) + except Exception: + # Fallback: convert everything else to string + return str(obj) + try: - if format == "yaml" or format == "yml": - output = yaml.dump(full_report_data, indent=2, sort_keys=False) - file_type = "YAML" - else: # Default to JSON - output = json.dumps(full_report_data, indent=2) - file_type = "JSON" + json_output = json.dumps( + full_report, + indent=2, + ensure_ascii=False, + default=safe_encoder + ) return { "status": "success", - "message": f"Full report generated in {file_type} format. You can copy the content below.", - "report_content": output, - "format": file_type + "message": "Full report generated in JSON format. 
You can copy the content below.", + "report_content": json_output, + "format": "JSON" } + except Exception as e: - logger.error(f"Error generating {format} report: {e}", exc_info=True) - return {"error": f"Failed to generate {format} report: {str(e)}"} + logger.error(f"Error generating JSON report: {e}", exc_info=True) + return {"status": "error", "error": f"Failed to generate JSON report: {str(e)}"} + async def generate_erd_script(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: - """Generates a Mermaid script for an Entity Relationship Diagram.""" - schema_structure = tool_context.state.get("schema_structure") - selected_schema = tool_context.state.get("selected_schema", "Schema") + """ + Generates a complete, valid Mermaid ER Diagram script. + + This function uses 'schema_structure' from the tool context's session state + to build a fully compliant Mermaid ERD. It includes tables, columns, data + types, constraints, and both explicit and inferred relationships. + + Automatically fixes known issues: + - Normalizes table names to uppercase. + - Removes invalid precision (e.g., decimal(10,2) -> decimal). + - Escapes quotes and special characters for Mermaid syntax. + - Ensures all sections render correctly. + + Args: + tool_context: The ADK tool context providing session state. + args: Optional argument dictionary (currently unused). + + Returns: + Dict[str, Any]: { + "status": "success" | "error", + "message": Description message, + "script_type": "Mermaid", + "script": Mermaid ERD text (if success), + "error": Optional error message (if failure) + } + """ + if not isinstance(args, dict): + return { + "status": "error", + "error": "Invalid arguments. Expected a dictionary for args." + } + + schema_structure = tool_context.state.get("schema_structure") if not schema_structure: - return {"error": "Schema structure not found. Please run introspection first."} + return { + "status": "error", + "error": "Schema structure not found. Please run introspection first." 
+ } - tables = schema_structure.get("tables", {}) - fks = schema_structure.get("foreign_keys", []) - inferred = schema_structure.get("inferred_relationships", []) + def sanitize_datatype(dtype: str) -> str: + """Normalize SQL data types to Mermaid-safe equivalents.""" + if not dtype: + return "text" + dtype = dtype.strip().lower() + if dtype.startswith("decimal"): + return "decimal" + if dtype.startswith("varchar"): + return "varchar" + if dtype.startswith("numeric"): + return "numeric" + if "int" in dtype: + return "int" + if dtype.startswith("enum"): + return "enum" + if "timestamp" in dtype: + return "timestamp" + return dtype.replace("(", "").replace(")", "").replace(",", "").replace(" ", "_") + + def format_column(table_name: str, col_name: str, col_info: Dict[str, Any], constraints_info: List[Dict[str, Any]]) -> str: + """Format a column entry with proper constraints for Mermaid.""" + dtype = sanitize_datatype(col_info.get("type", "text")) + constraints = [] + + for c in constraints_info: + if col_name in c.get("columns", []): + c_type = c.get("type", "").upper() + if "PRIMARY" in c_type: + constraints.append("PK") + elif "UNIQUE" in c_type: + constraints.append("UK") + + if not col_info.get("nullable", True): + constraints.append("NN") - mermaid_script = "erDiagram\n" + for fk in schema_structure.get("foreign_keys", []): + if ( + fk.get("from_column") == col_name + and fk.get("from_table", "").lower() == table_name.lower() + ): + constraints.append("FK") + break - # Add entities and attributes + constraint_str = f' "{", ".join(constraints)}"' if constraints else "" + return f" {dtype} {col_name}{constraint_str}" + + lines = ["erDiagram"] + + tables = schema_structure.get("tables", {}) for table_name, table_info in tables.items(): - mermaid_script += f' {table_name} {{\n' + tname = table_name.upper() + lines.append(f" {tname} {{") + columns = table_info.get("columns", {}) + constraints_info = table_info.get("constraints", []) + for col_name, col_info in columns.items(): - col_type = col_info.get("type", "") - constraints = [] - for const in table_info.get("constraints", []): - if const.get("columns") == col_name: - if const.get("type") == "PRIMARY KEY": constraints.append("PK") - if const.get("type") == "UNIQUE": constraints.append("UK") - if not col_info.get("nullable"): constraints.append("NN") - - constraint_str = f" \"{', '.join(constraints)}\"" if constraints else "" - mermaid_script += f' {col_type} {col_name}{constraint_str}\n' - mermaid_script += ' }\n' - - # Add relationships - for fk in fks: - from_table = fk.get("from_table") - to_table = fk.get("to_table") - from_column = fk.get("from_column") - # label = fk.get("constraint_name", "") - mermaid_script += f' {from_table} ||--o{{ {to_table} : "{from_column}"\n' - - # Add inferred relationships + lines.append(format_column(table_name, col_name, col_info, constraints_info)) + + lines.append(" }") + lines.append("") + + fks = schema_structure.get("foreign_keys", []) + if fks: + lines.append(" %% -- Explicit Relationships --") + for fk in fks: + from_table = fk.get("from_table", "").upper() + to_table = fk.get("to_table", "").upper() + from_column = fk.get("from_column", "") + if from_table and to_table: + lines.append(f' {from_table} ||--o{{ {to_table} : "{from_column}"') + + inferred = schema_structure.get("inferred_relationships", []) if inferred: - mermaid_script += "\n %% Inferred Relationships\n" + lines.append("\n %% -- Inferred Relationships --") for rel in inferred: - from_table = rel.get("from_table") - to_table = 
rel.get("to_table") - from_column = rel.get("from_column") - mermaid_script += f' {from_table} ..o{{ {to_table} : "Inferred: {from_column}"\n' + from_table = rel.get("from_table", "").upper() + to_table = rel.get("to_table", "").upper() + from_column = rel.get("from_column", "") + + if from_table and to_table: + # Optional → Optional: }o--o{ + lines.append( + f' {from_table} }}o--o{{ {to_table} : "INFERRED: {from_column}"' + ) + + mermaid_script = "\n".join(lines) + "\n" return { "status": "success", - "message": "Mermaid ERD script generated. You can render this in a Mermaid renderer.", + "message": "Mermaid ERD script generated successfully. Paste this code into any Mermaid renderer.", "script_type": "Mermaid", "script": mermaid_script } \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py index 1e3a32b..846b8ef 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py @@ -1,5 +1,4 @@ from google.adk.agents.llm_agent import LlmAgent -from google.adk.tools import FunctionTool from .tools import get_schema_details import json @@ -9,11 +8,33 @@ description='Introspects the selected database schema to discover tables, columns, constraints, relationships, indexes, and views.', instruction=""" ### Role - You are a Database Schema Introspection Agent. Your task is to analyze the structure of a selected database schema. + You are a **Database Schema Introspection Agent**. Your sole task is to fetch and summarize the schema structure of a database. + + ### Scope + - You can only report **schema-level information**: tables, columns, constraints, indexes, foreign keys, inferred relationships, and anomalies. + - Do **not** answer questions about data content, queries, or performance. Forward all other questions to the QA agent using: + ```python + transfer_to_agent(qa_agent, query) + ``` + + ### Formatting + - Present table-like data using proper pipe tables: + +------------------+------------------+------------------+ + | Column 1 | Column 2 | Column 3 | + +------------------+------------------+------------------+ + | Row 1, Col 1 | Row 1, Col 2 | Row 1, Col 3 | + |------------------+------------------+------------------| + | Row 2, Col 1 | Row 2, Col 2 | Row 2, Col 3 | + |------------------+------------------+------------------| + | Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 | + +------------------+------------------+------------------+ ### Task - 1. **Receive Schema Name:** The user's query to this agent (available as the variable `query`) IS the schema name. - 2. **Call Tool:** Invoke `get_schema_details(args={"schema_name": query})`. + + 1. **Receive Schema Name:** The user's query to this agent (available as the variable `query`) IS the schema name to be introspected. + + 2. **Call Tool:** Invoke the `get_schema_details` tool. You MUST pass the schema name as a dictionary to the `args` parameter of the tool. + - **Tool Call:** `get_schema_details(args={"schema_name": query})` 3. **Process Results:** - If the tool call returns `status`: "success": @@ -24,19 +45,28 @@ "I have successfully introspected the schema '{tool_result.schema_name}'. 
Here's a summary of what I found: - **Tables:** {tool_result.summary.tables} (with {tool_result.summary.columns} columns in total) - **Views:** {tool_result.summary.views} - - **Constraints:** {tool_result.summary.constraints} - - **Indexes:** {tool_result.summary.indexes} + - **Constraints:** {tool_result.summary.constraints} (Across all tables) + - **Indexes:** {tool_result.summary.indexes} (Across all tables) - **Explicit Foreign Keys:** {tool_result.summary.explicit_fks} - **Potential Inferred Relationships:** {tool_result.summary.inferred_relationships} - - **Relationship Anomalies Detected:** {tool_result.summary.anomalies} + - **Schema Relationship Anomalies Detected:** {tool_result.summary.anomalies} - The full details are stored. What would you like to explore further about the '{tool_result.schema_name}' schema? You can ask things like: + The full details are stored. What would you like to explore further about the '{tool_result.schema_name}' schema? I can help you with: - 'List all tables.' - 'Describe the table .' - 'Show foreign keys involving the table.' - - 'Tell me about any anomalies found.'" + - 'Tell me about any anomalies found.' + - 'List any inferred relationships.'" + + - If the tool call returns an error, follow the **Error Handling** instruction above. - - If the tool call returns an error, relay the error message to the user. + ### IMPORTANT + - If there is anything which is not in your scope or you cannot answer transfer the query to the root agent calling transfer_to_agent(data_model_discovery_agent, query) + - For anything outside this scope, immediately call: + ```python + transfer_to_agent(qa_agent, query) + ``` + - Focus **only** on fetching and summarizing schema details. """, tools=[ get_schema_details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py index 9a52acd..08cb65e 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py @@ -62,7 +62,6 @@ async def get_schema_details(tool_context: ToolContext, args: Dict[str, Any]) -> return {"error": "schema_name not provided in args or is empty."} schema_name = str(schema_name).strip() - # ... (state checks as before) ... 
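For context, a hedged sketch of how a tool like `get_schema_details` might fan out to the per-database helpers added in this patch (`get_postgres_schema_details`, `get_mysql_schema_details`, `get_mssql_schema_details`); the import paths and dispatch shape are assumptions based on the file layout, not the actual tool body.

```python
# Assumed dispatch shape only; the real tool reads connection info from
# session state and may differ.
from typing import Any, Dict

from .utils.postgresql_utils import get_postgres_schema_details
from .utils.mysql_utils import get_mysql_schema_details
from .utils.mssql_utils import get_mssql_schema_details

def introspect(conn: Any, db_type: str, schema_name: str) -> Dict[str, Any]:
    if db_type == "postgresql":
        return get_postgres_schema_details(conn, schema_name)
    if db_type == "mysql":
        return get_mysql_schema_details(conn, schema_name)
    if db_type == "mssql":
        return get_mssql_schema_details(conn, schema_name)
    raise ValueError(f"Unsupported database type: {db_type}")
```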
db_conn_state = tool_context.state.get("db_connection") db_creds = tool_context.state.get("db_creds_temp") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py index 1a784b5..6cc5b74 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py @@ -1,154 +1,229 @@ import logging from typing import Dict, Any, List import pyodbc +import json +import os +import re +from google import genai +from google.api_core import exceptions +from google.genai import types +import google.auth logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + +# --- Copied LLM Client Setup & Helper Functions --- +try: + _, project_id = google.auth.default() + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) +except google.auth.exceptions.DefaultCredentialsError: + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") + +if not GOOGLE_CLOUD_PROJECT: + logger.warning("GOOGLE_CLOUD_PROJECT not set.") + +GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "True").lower() in ("true", "1") +MODEL = os.environ.get("MODEL", "gemini-1.5-pro") + +client = None +if GOOGLE_CLOUD_PROJECT: + try: + client = genai.Client( + vertexai=GOOGLE_GENAI_USE_VERTEXAI, + project=GOOGLE_CLOUD_PROJECT, + location=GOOGLE_CLOUD_LOCATION, + ) + logger.info(f"GenAI Client initialized in mssql_utils.") + except Exception as e: + logger.error(f"Failed to initialize GenAI Client in mssql_utils: {e}") +else: + logger.error("Cannot initialize GenAI Client in mssql_utils: GOOGLE_CLOUD_PROJECT is not set.") + +def _construct_llm_prompt(schema_name: str, db_type:str, schema_details: Dict[str, Any]) -> str: + # ... This function is IDENTICAL to the one in mysql_utils.py ... + tables_context = {} + for table_name, table_info in schema_details.get("tables", {}).items(): + tables_context[table_name] = { + "columns": list(table_info.get("columns", {}).keys()), + "constraints": table_info.get("constraints", []) + } + context = { + "db_type": db_type, + "schema_name": schema_name, + "tables": tables_context, + "existing_foreign_keys": schema_details.get("foreign_keys", []) + } + context_json = json.dumps(context, indent=4) + prompt = f""" + You are a database expert analyzing the schema of a {db_type} database named '{schema_name}'. + Your task is to identify potential inferred relationships and relationship anomalies based on the provided schema information. + + Here is the schema context: + ```json + {context_json} + ``` + + **Tasks:** + + 1. **Inferred Relationship Suggestion:** + Analyze the table and column names. Suggest potential foreign key relationships that are NOT already defined in `existing_foreign_keys`. + Common patterns include columns like `user_id`, `product_code`, `order_uuid`, etc., potentially linking to `id` or similar columns in other tables (e.g., `users.id`). + For each suggestion, provide the `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` (why you think it's related), and a `suggestion` (e.g., "Consider adding a foreign key"). + + 2. 
**Relationship Anomaly Detection:** + Examine the `existing_foreign_keys`. For each foreign key, check if the `to_table` and `to_column` exist in the `tables` context. Also, verify if the `to_column` in the `to_table` is part of a PRIMARY KEY or UNIQUE constraint in that table's constraints list. + Flag any anomalies where: + a. The `to_table` is not in the `tables` context. + b. The `to_column` is not in the `columns` list of the `to_table`. + c. The `to_column` in the `to_table` is NOT listed as a 'PRIMARY KEY' or 'UNIQUE' in its constraints. + For each anomaly, provide the `constraint_name`, `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` of the issue, and a `suggestion` (e.g., "Verify target column exists" or "Target column should be PK/UK"). + + **Output Format:** + Return your findings as a single JSON object with two keys: "inferred_relationships" and "anomalies". The JSON must be well-formed. + + ```json + {{ + "inferred_relationships": [ + {{ + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ], + "anomalies": [ + {{ + "constraint_name": "string", + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ] + }} + ``` + If no inferred relationships or anomalies are found, return empty lists for the respective keys. + """ + return prompt + +def _extract_json_content(text: str) -> str: + """Extracts JSON content from Markdown-style code fences (```json ... ```).""" + if not text: return "" + match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) + extracted = match.group(1).strip() if match else text.strip() + try: + parsed = json.loads(extracted) + return json.dumps(parsed, indent=4) + except json.JSONDecodeError: + return extracted + +def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]: + """Calls an LLM to get inferred relationships and anomalies.""" + if not client: + logger.error("GenAI Client not initialized. Skipping LLM analysis.") + return {"inferred_relationships": [], "anomalies": [{"error": "LLM client not available."}]} + + prompt = _construct_llm_prompt(schema_name, db_type, schema_details) + logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") + generated_text = "" + try: + # logger.info(f"****** Custom_LLM_Request: {prompt}") + response = client.models.generate_content( + model=MODEL, + contents=[types.Part.from_text(text=prompt)], + config=types.GenerateContentConfig(response_mime_type="application/json"), + ) + generated_text = response.candidates[0].content.parts[0].text + # logger.info(f"****** Raw LLM Response: {generated_text}") + cleaned_json = _extract_json_content(generated_text) + # logger.info(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + llm_output = json.loads(cleaned_json) + inferred = llm_output.get("inferred_relationships", []) + anomalies = llm_output.get("anomalies", []) + if not isinstance(inferred, list) or not isinstance(anomalies, list): + raise ValueError("LLM response is not in the expected list format for keys.") + return {"inferred_relationships": inferred, "anomalies": anomalies} + except json.JSONDecodeError as e: + logger.error(f"Error decoding LLM JSON response: {e}. 
Cleaned Response: {cleaned_json}") + return {"inferred_relationships": [], "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}]} + except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: + logger.error(f"Error calling LLM or processing response: {e}") + return {"inferred_relationships": [], "anomalies": [{"error": f"LLM analysis failed: {e}"}]} + except Exception as e: + logger.error(f"Unexpected error during LLM analysis: {e}") + return {"inferred_relationships": [], "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}]} +# --- End Copied LLM Functions --- def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: - """Executes a SQL query and returns results as a list of dicts.""" + """Executes a SQL query and returns results as a list of dicts for SQL Server.""" cursor = conn.cursor() try: cursor.execute(query) - columns = [column[0] for column in cursor.description] - results = [] - for row in cursor.fetchall(): - results.append(dict(zip(columns, row))) - return results + if cursor.description: + columns = [column[0] for column in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, row)) for row in rows] + return [] finally: cursor.close() def get_mssql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: logger.info(f"Fetching MSSQL schema details for: {schema_name}") - details = {"tables": {}, "views": {}, "foreign_keys": []} + details = {"tables": {}, "views": {}, "foreign_keys": [], "inferred_relationships": [], "anomalies": []} - # Tables - tables_query = f""" - SELECT TABLE_NAME - FROM INFORMATION_SCHEMA.TABLES - WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_TYPE = 'BASE TABLE'; - """ + tables_query = f"SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_TYPE = 'BASE TABLE';" tables = _execute_query(conn, tables_query) - for table in tables: t_name = table['TABLE_NAME'] details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} - - # Columns - cols_query = f""" - SELECT COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COLUMN_DEFAULT - FROM INFORMATION_SCHEMA.COLUMNS - WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_NAME = '{t_name}'; + cols_query = f"SELECT COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_NAME = '{t_name}';" + for col in _execute_query(conn, cols_query): + details["tables"][t_name]["columns"][col['COLUMN_NAME']] = {"type": col['DATA_TYPE'], "length": col['CHARACTER_MAXIMUM_LENGTH'], "precision": col['NUMERIC_PRECISION'], "scale": col['NUMERIC_SCALE'], "nullable": col['IS_NULLABLE'] == 'YES', "default": col['COLUMN_DEFAULT']} + constraints_query = f""" + SELECT KCU.TABLE_NAME, TC.CONSTRAINT_NAME, TC.CONSTRAINT_TYPE, KCU.COLUMN_NAME, CC.CHECK_CLAUSE + FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC + LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA AND TC.TABLE_NAME = KCU.TABLE_NAME + LEFT JOIN INFORMATION_SCHEMA.CHECK_CONSTRAINTS AS CC ON TC.CONSTRAINT_NAME = CC.CONSTRAINT_NAME AND TC.CONSTRAINT_SCHEMA = CC.CONSTRAINT_SCHEMA + WHERE TC.TABLE_SCHEMA = '{schema_name}' AND KCU.TABLE_NAME = '{t_name}'; """ - columns = _execute_query(conn, cols_query) - for col in columns: - details["tables"][t_name]["columns"][col['COLUMN_NAME']] = { - "type": 
col['DATA_TYPE'], - "length": col['CHARACTER_MAXIMUM_LENGTH'], - "precision": col['NUMERIC_PRECISION'], - "scale": col['NUMERIC_SCALE'], - "nullable": col['IS_NULLABLE'] == 'YES', - "default": col['COLUMN_DEFAULT'], - } - - # Constraints (PK, UNIQUE, CHECK) - constraints_query = f""" - SELECT - KCU.TABLE_NAME, - TC.CONSTRAINT_NAME, - TC.CONSTRAINT_TYPE, - KCU.COLUMN_NAME, - CC.CHECK_CLAUSE - FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC - LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU - ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA AND TC.TABLE_NAME = KCU.TABLE_NAME - LEFT JOIN INFORMATION_SCHEMA.CHECK_CONSTRAINTS AS CC - ON TC.CONSTRAINT_NAME = CC.CONSTRAINT_NAME AND TC.CONSTRAINT_SCHEMA = CC.CONSTRAINT_SCHEMA - WHERE TC.TABLE_SCHEMA = '{schema_name}'; - """ - constraints = _execute_query(conn, constraints_query) - for const in constraints: - t_name = const['TABLE_NAME'] - if t_name in details["tables"]: - details["tables"][t_name]["constraints"].append({ - "name": const['CONSTRAINT_NAME'], - "type": const['CONSTRAINT_TYPE'], - "columns": const['COLUMN_NAME'], - "check_clause": const['CHECK_CLAUSE'], - }) - - # Indexes - indexes_query = f""" - SELECT - t.name AS table_name, - ind.name AS index_name, - COL_NAME(ic.object_id, ic.column_id) AS column_name, - ind.is_unique - FROM sys.indexes ind - INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id AND ind.index_id = ic.index_id - INNER JOIN sys.tables t ON ind.object_id = t.object_id - INNER JOIN sys.schemas s ON t.schema_id = s.schema_id - WHERE s.name = '{schema_name}' AND ind.is_hypothetical = 0 AND ind.is_primary_key = 0 AND ind.type > 0 - ORDER BY t.name, ind.name, ic.key_ordinal; - """ - try: - indexes = _execute_query(conn, indexes_query) - for index in indexes: - t_name = index['table_name'] - if t_name in details["tables"]: - idx_name = index['index_name'] - if not idx_name: continue - found = False - for existing_idx in details["tables"][t_name]["indexes"]: - if existing_idx["name"] == idx_name: - if index['column_name'] not in existing_idx["columns"]: - existing_idx["columns"].append(index['column_name']) - found = True - break - if not found: - details["tables"][t_name]["indexes"].append({ - "name": idx_name, - "columns": [index['column_name']], - "unique": index['is_unique'] - }) - except Exception as e: - logger.error(f"Error fetching MSSQL indexes: {e}") + details["tables"][t_name]["constraints"] = _execute_query(conn, constraints_query) + indexes_query = f""" + SELECT t.name AS table_name, ind.name AS index_name, COL_NAME(ic.object_id, ic.column_id) AS column_name, ind.is_unique + FROM sys.indexes ind INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id AND ind.index_id = ic.index_id + INNER JOIN sys.tables t ON ind.object_id = t.object_id INNER JOIN sys.schemas s ON t.schema_id = s.schema_id + WHERE s.name = '{schema_name}' AND t.name = '{t_name}' AND ind.is_hypothetical = 0 AND ind.is_primary_key = 0 AND ind.type > 0; + """ + try: + indexes = _execute_query(conn, indexes_query) + grouped_indexes = {} + for index in indexes: + idx_name = index['index_name'] + if not idx_name: continue + if idx_name not in grouped_indexes: grouped_indexes[idx_name] = {"name": idx_name, "columns": [], "unique": index['is_unique']} + if index['column_name'] not in grouped_indexes[idx_name]["columns"]: grouped_indexes[idx_name]["columns"].append(index['column_name']) + details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) + except Exception as e: 
logger.error(f"Error fetching MSSQL indexes for {t_name}: {e}") - # Foreign Keys fks_query = f""" - SELECT - KCU1.CONSTRAINT_NAME AS fk_constraint_name - ,KCU1.TABLE_SCHEMA AS from_schema - ,KCU1.TABLE_NAME AS from_table - ,KCU1.COLUMN_NAME AS from_column - ,KCU2.TABLE_SCHEMA AS to_schema - ,KCU2.TABLE_NAME AS to_table - ,KCU2.COLUMN_NAME AS to_column + SELECT KCU1.CONSTRAINT_NAME AS fk_constraint_name, KCU1.TABLE_SCHEMA AS from_schema, KCU1.TABLE_NAME AS from_table, KCU1.COLUMN_NAME AS from_column, + KCU2.TABLE_SCHEMA AS to_schema, KCU2.TABLE_NAME AS to_table, KCU2.COLUMN_NAME AS to_column FROM INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS RC - JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU1 - ON KCU1.CONSTRAINT_CATALOG = RC.CONSTRAINT_CATALOG - AND KCU1.CONSTRAINT_SCHEMA = RC.CONSTRAINT_SCHEMA - AND KCU1.CONSTRAINT_NAME = RC.CONSTRAINT_NAME - JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU2 - ON KCU2.CONSTRAINT_CATALOG = RC.UNIQUE_CONSTRAINT_CATALOG - AND KCU2.CONSTRAINT_SCHEMA = RC.UNIQUE_CONSTRAINT_SCHEMA - AND KCU2.CONSTRAINT_NAME = RC.UNIQUE_CONSTRAINT_NAME - AND KCU2.ORDINAL_POSITION = KCU1.ORDINAL_POSITION + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU1 ON KCU1.CONSTRAINT_SCHEMA = RC.CONSTRAINT_SCHEMA AND KCU1.CONSTRAINT_NAME = RC.CONSTRAINT_NAME + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU2 ON KCU2.CONSTRAINT_SCHEMA = RC.UNIQUE_CONSTRAINT_SCHEMA AND KCU2.CONSTRAINT_NAME = RC.UNIQUE_CONSTRAINT_NAME AND KCU2.ORDINAL_POSITION = KCU1.ORDINAL_POSITION WHERE KCU1.TABLE_SCHEMA = '{schema_name}'; """ - try: - details["foreign_keys"] = _execute_query(conn, fks_query) - except Exception as e: - logger.error(f"Error fetching MSSQL foreign keys: {e}") - details["foreign_keys"] = [{"error": str(e)}] - - # Views - views_query = f""" - SELECT TABLE_NAME AS view_name, VIEW_DEFINITION - FROM INFORMATION_SCHEMA.VIEWS - WHERE TABLE_SCHEMA = '{schema_name}'; - """ - views = _execute_query(conn, views_query) - for view in views: - details["views"][view['view_name']] = {"definition": view['VIEW_DEFINITION']} + details["foreign_keys"] = _execute_query(conn, fks_query) + views_query = f"SELECT TABLE_NAME AS view_name, VIEW_DEFINITION FROM INFORMATION_SCHEMA.VIEWS WHERE TABLE_SCHEMA = '{schema_name}';" + for view in _execute_query(conn, views_query): details["views"][view['view_name']] = {"definition": view['VIEW_DEFINITION']} + llm_analysis = _analyze_with_llm(schema_name, "Microsoft SQL Server", details) + details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) + details["anomalies"] = llm_analysis.get("anomalies", []) + logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships for MSSQL.") + logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies for MSSQL.") return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py index 3f0bff8..af41bfb 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py @@ -1,8 +1,43 @@ import logging from typing import Dict, Any, List import mysql.connector +import json +import os +import re +from google import genai +from google.api_core import exceptions +from google.genai import types +import google.auth logger = 
logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + +try: + _, project_id = google.auth.default() + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) +except google.auth.exceptions.DefaultCredentialsError: + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") + +if not GOOGLE_CLOUD_PROJECT: + logger.warning("GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials.") + +GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "True").lower() in ("true", "1") +MODEL = "gemini-2.5-pro" + +client = None +if GOOGLE_CLOUD_PROJECT: + try: + client = genai.Client( + vertexai=GOOGLE_GENAI_USE_VERTEXAI, + project=GOOGLE_CLOUD_PROJECT, + location=GOOGLE_CLOUD_LOCATION, + ) + logger.info(f"GenAI Client initialized. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}") + except Exception as e: + logger.error(f"Failed to initialize GenAI Client: {e}") +else: + logger.error("Cannot initialize GenAI Client: GOOGLE_CLOUD_PROJECT is not set.") def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts.""" @@ -13,6 +48,160 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: finally: cursor.close() +def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> str: + """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" + + tables_context = {} + for table_name, table_info in schema_details.get("tables", {}).items(): + tables_context[table_name] = { + "columns": list(table_info.get("columns", {}).keys()), + "constraints": table_info.get("constraints", []) + } + + context = { + "db_type": db_type, + "schema_name": schema_name, + "tables": tables_context, + "existing_foreign_keys": schema_details.get("foreign_keys", []) + } + + # Format JSON for readability + context_json = json.dumps(context, indent=4) + + prompt = f""" + You are a database expert analyzing the schema of a {db_type} database named '{schema_name}'. + Your task is to identify potential inferred relationships and relationship anomalies based on the provided schema information. + + Here is the schema context: + ```json + {context_json} + ``` + + **Tasks:** + + 1. **Inferred Relationship Suggestion:** + Analyze the table and column names. Suggest potential foreign key relationships that are NOT already defined in `existing_foreign_keys`. + Common patterns include columns like `user_id`, `product_code`, `order_uuid`, etc., potentially linking to `id` or similar columns in other tables (e.g., `users.id`). + For each suggestion, provide the `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` (why you think it's related), and a `suggestion` (e.g., "Consider adding a foreign key"). + + 2. **Relationship Anomaly Detection:** + Examine the `existing_foreign_keys`. For each foreign key, check if the `to_table` and `to_column` exist in the `tables` context. Also, verify if the `to_column` in the `to_table` is part of a PRIMARY KEY or UNIQUE constraint in that table's constraints list. + Flag any anomalies where: + a. The `to_table` is not in the `tables` context. + b. The `to_column` is not in the `columns` list of the `to_table`. + c. 
The `to_column` in the `to_table` is NOT listed as a 'PRIMARY KEY' or 'UNIQUE' in its constraints. + For each anomaly, provide the `constraint_name`, `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` of the issue, and a `suggestion` (e.g., "Verify target column exists" or "Target column should be PK/UK"). + + **Output Format:** + Return your findings as a single JSON object with two keys: "inferred_relationships" and "anomalies". The JSON must be well-formed. + + ```json + {{ + "inferred_relationships": [ + {{ + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ], + "anomalies": [ + {{ + "constraint_name": "string", + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ] + }} + ``` + If no inferred relationships or anomalies are found, return empty lists for the respective keys. + """ + return prompt + +def _extract_json_content(text: str) -> str: + """ + Extracts JSON content from Markdown-style code fences (```json ... ```). + If no fences are present, returns the text as-is. + """ + if not text: + return "" + + match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) + if match: + extracted = match.group(1).strip() + else: + extracted = text.strip() + + # Try to pretty format if valid JSON + try: + parsed = json.loads(extracted) + return json.dumps(parsed, indent=4) + except json.JSONDecodeError: + return extracted + +def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]: + """Calls an LLM to get inferred relationships and anomalies.""" + if not client: + logger.error("GenAI Client not initialized. Skipping LLM analysis.") + return { + "inferred_relationships": [], + "anomalies": [{"error": "LLM client not available."}] + } + + prompt = _construct_llm_prompt(schema_name, db_type, schema_details) + logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") + generated_text = "" + try: + logger.info(f"****** Custom_LLM_Request: {prompt}") + response = client.models.generate_content( + model=MODEL, + contents=[types.Part.from_text(text=prompt)], + ) + generated_text = response.candidates[0].content.parts[0].text + logger.info(f"****** Raw LLM Response: {generated_text}") + + # 🔹 Extract JSON content (handles ```json blocks) + cleaned_json = _extract_json_content(generated_text) + logger.info(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + + # Parse the cleaned JSON + llm_output = json.loads(cleaned_json) + inferred = llm_output.get("inferred_relationships", []) + anomalies = llm_output.get("anomalies", []) + + if not isinstance(inferred, list) or not isinstance(anomalies, list): + raise ValueError("LLM response is not in the expected list format for keys.") + + return { + "inferred_relationships": inferred, + "anomalies": anomalies + } + + except json.JSONDecodeError as e: + logger.error(f"Error decoding LLM JSON response: {e}. 
Cleaned Response: {cleaned_json}") + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}] + } + except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: + logger.error(f"Error calling LLM or processing response: {e}") + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM analysis failed: {e}"}] + } + except Exception as e: + logger.error(f"Unexpected error during LLM analysis: {e}") + return { + "inferred_relationships": [], + "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}] + } + def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: # For MySQL, schema_name is the database name. logger.info(f"Fetching MySQL schema details for: {schema_name}") @@ -22,85 +211,54 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: logger.error(f"MySQL change database failed: {err}") raise - details = {"tables": {}, "views": {}, "foreign_keys": []} + details = {"tables": {}, "views": {}, "foreign_keys": [], "inferred_relationships": [], "anomalies": []} - # Tables + # 1. Fetch Basic Schema Info tables_query = "SHOW FULL TABLES WHERE Table_type = 'BASE TABLE';" tables = _execute_query(conn, tables_query) table_names = [list(t.values())[0] for t in tables] for t_name in table_names: details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} - - # Columns cols_query = f"DESCRIBE `{t_name}`;" columns = _execute_query(conn, cols_query) for col in columns: details["tables"][t_name]["columns"][col['Field']] = { - "type": col['Type'], - "nullable": col['Null'] == 'YES', - "default": col['Default'], - "key": col['Key'], # PRI, UNI, MUL - "extra": col['Extra'], + "type": col['Type'], "nullable": col['Null'] == 'YES', "default": col['Default'], + "key": col['Key'], "extra": col['Extra'], } - # Constraints (PK, UNIQUE) constraints_query = f""" - SELECT - KCU.CONSTRAINT_NAME, - TC.CONSTRAINT_TYPE, - KCU.COLUMN_NAME - FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC - JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU - ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME - AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA - AND TC.TABLE_NAME = KCU.TABLE_NAME - WHERE TC.TABLE_SCHEMA = '{schema_name}' AND TC.TABLE_NAME = '{t_name}' - AND TC.CONSTRAINT_TYPE IN ('PRIMARY KEY', 'UNIQUE'); + SELECT KCU.CONSTRAINT_NAME, TC.CONSTRAINT_TYPE, KCU.COLUMN_NAME + FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC + LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU + ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA AND TC.TABLE_NAME = KCU.TABLE_NAME + WHERE TC.TABLE_SCHEMA = '{schema_name}' AND TC.TABLE_NAME = '{t_name}' + AND TC.CONSTRAINT_TYPE IN ('PRIMARY KEY', 'UNIQUE', 'FOREIGN KEY', 'CHECK'); """ - constraints = _execute_query(conn, constraints_query) - for const in constraints: - details["tables"][t_name]["constraints"].append({ - "name": const['CONSTRAINT_NAME'], - "type": const['CONSTRAINT_TYPE'], - "columns": const['COLUMN_NAME'], - }) - # Note: MySQL CHECK constraints are in information_schema.CHECK_CONSTRAINTS - - # Indexes + details["tables"][t_name]["constraints"] = _execute_query(conn, constraints_query) + indexes_query = f"SHOW INDEX FROM `{t_name}`;" indexes = _execute_query(conn, indexes_query) grouped_indexes = {} for index in indexes: idx_name = index['Key_name'] if idx_name not in grouped_indexes: - grouped_indexes[idx_name] = { - "name": idx_name, - "columns": [], - "unique": index['Non_unique'] == 0 - } + 
grouped_indexes[idx_name] = {"name": idx_name, "columns": [], "unique": index['Non_unique'] == 0} grouped_indexes[idx_name]["columns"].append(index['Column_name']) details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) - # Foreign Keys fks_query = f""" - SELECT - KCU.TABLE_NAME AS from_table, - KCU.COLUMN_NAME AS from_column, - KCU.REFERENCED_TABLE_NAME AS to_table, - KCU.REFERENCED_COLUMN_NAME AS to_column, - KCU.CONSTRAINT_NAME - FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU - WHERE KCU.TABLE_SCHEMA = '{schema_name}' - AND KCU.REFERENCED_TABLE_NAME IS NOT NULL; + SELECT KCU.TABLE_NAME AS from_table, KCU.COLUMN_NAME AS from_column, + KCU.REFERENCED_TABLE_NAME AS to_table, KCU.REFERENCED_COLUMN_NAME AS to_column, KCU.CONSTRAINT_NAME + FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU + WHERE KCU.TABLE_SCHEMA = '{schema_name}' AND KCU.REFERENCED_TABLE_NAME IS NOT NULL; """ details["foreign_keys"] = _execute_query(conn, fks_query) - # Views views_query = "SHOW FULL TABLES WHERE Table_type = 'VIEW';" views = _execute_query(conn, views_query) - view_names = [list(v.values())[0] for v in views] - for v_name in view_names: + for v_name in [list(v.values())[0] for v in views]: try: definition_query = f"SHOW CREATE VIEW `{v_name}`;" definition = _execute_query(conn, definition_query) @@ -109,4 +267,16 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: logger.warning(f"Could not fetch view definition for {v_name}: {e}") details["views"][v_name] = {"definition": "N/A"} + # 2. LLM-based Analysis for Inferred Relationships and Anomalies + llm_analysis = _analyze_with_llm(schema_name, "MySQL", details) + details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) + details["anomalies"] = llm_analysis.get("anomalies", []) + + logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships.") + logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies.") + + logger.info("************************") + logger.info(details) + logger.info("************************") + return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py index 6df5c77..22ecc76 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py @@ -1,10 +1,47 @@ import logging from typing import Dict, Any, List +import psycopg2 +import json +import os +import re +from google import genai +from google.api_core import exceptions +from google.genai import types +import google.auth logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + +# --- LLM Client Setup --- +try: + _, project_id = google.auth.default() + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) +except google.auth.exceptions.DefaultCredentialsError: + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") + +if not GOOGLE_CLOUD_PROJECT: + logger.warning("GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials.") + +GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", 
"True").lower() in ("true", "1") +MODEL = os.environ.get("MODEL", "gemini-1.5-pro") + +client = None +if GOOGLE_CLOUD_PROJECT: + try: + client = genai.Client( + vertexai=GOOGLE_GENAI_USE_VERTEXAI, + project=GOOGLE_CLOUD_PROJECT, + location=GOOGLE_CLOUD_LOCATION, + ) + logger.info(f"GenAI Client initialized in postgres_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}") + except Exception as e: + logger.error(f"Failed to initialize GenAI Client in postgres_utils: {e}") +else: + logger.error("Cannot initialize GenAI Client in postgres_utils: GOOGLE_CLOUD_PROJECT is not set.") def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: - """Executes a SQL query and returns results as a list of dicts.""" + """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" cursor = conn.cursor() try: cursor.execute(query) @@ -16,125 +53,183 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: finally: cursor.close() +def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> str: + """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" + tables_context = {} + for table_name, table_info in schema_details.get("tables", {}).items(): + tables_context[table_name] = { + "columns": list(table_info.get("columns", {}).keys()), + "constraints": table_info.get("constraints", []) + } + context = { + "db_type": db_type, + "schema_name": schema_name, + "tables": tables_context, + "existing_foreign_keys": schema_details.get("foreign_keys", []) + } + context_json = json.dumps(context, indent=4) + prompt = f""" + You are a database expert analyzing the schema of a {db_type} database named '{schema_name}'. + Your task is to identify potential inferred relationships and relationship anomalies based on the provided schema information. + + Here is the schema context: + ```json + {context_json} + ``` + + **Tasks:** + + 1. **Inferred Relationship Suggestion:** + Analyze the table and column names. Suggest potential foreign key relationships that are NOT already defined in `existing_foreign_keys`. + Common patterns include columns like `user_id`, `product_code`, `order_uuid`, etc., potentially linking to `id` or similar columns in other tables (e.g., `users.id`). + For each suggestion, provide the `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` (why you think it's related), and a `suggestion` (e.g., "Consider adding a foreign key"). + + 2. **Relationship Anomaly Detection:** + Examine the `existing_foreign_keys`. For each foreign key, check if the `to_table` and `to_column` exist in the `tables` context. Also, verify if the `to_column` in the `to_table` is part of a PRIMARY KEY or UNIQUE constraint in that table's constraints list. + Flag any anomalies where: + a. The `to_table` is not in the `tables` context. + b. The `to_column` is not in the `columns` list of the `to_table`. + c. The `to_column` in the `to_table` is NOT listed as a 'PRIMARY KEY' or 'UNIQUE' in its constraints. + For each anomaly, provide the `constraint_name`, `from_table`, `from_column`, `to_table`, `to_column`, an `explanation` of the issue, and a `suggestion` (e.g., "Verify target column exists" or "Target column should be PK/UK"). + + **Output Format:** + Return your findings as a single JSON object with two keys: "inferred_relationships" and "anomalies". The JSON must be well-formed. 
+ + ```json + {{ + "inferred_relationships": [ + {{ + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ], + "anomalies": [ + {{ + "constraint_name": "string", + "from_table": "string", + "from_column": "string", + "to_table": "string", + "to_column": "string", + "explanation": "string", + "suggestion": "string" + }} + ] + }} + ``` + If no inferred relationships or anomalies are found, return empty lists for the respective keys. + """ + return prompt + +def _extract_json_content(text: str) -> str: + """Extracts JSON content from Markdown-style code fences (```json ... ```).""" + if not text: return "" + match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) + extracted = match.group(1).strip() if match else text.strip() + try: + parsed = json.loads(extracted) + return json.dumps(parsed, indent=4) + except json.JSONDecodeError: + return extracted + +def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]: + """Calls an LLM to get inferred relationships and anomalies.""" + if not client: + logger.error("GenAI Client not initialized. Skipping LLM analysis.") + return {"inferred_relationships": [], "anomalies": [{"error": "LLM client not available."}]} + + prompt = _construct_llm_prompt(schema_name, db_type, schema_details) + logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") + generated_text = "" + try: + # logger.info(f"****** Custom_LLM_Request: {prompt}") + response = client.models.generate_content( + model=MODEL, + contents=[types.Part.from_text(text=prompt)], + config=types.GenerateContentConfig(response_mime_type="application/json"), + ) + generated_text = response.candidates[0].content.parts[0].text + # logger.info(f"****** Raw LLM Response: {generated_text}") + cleaned_json = _extract_json_content(generated_text) + # logger.info(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + llm_output = json.loads(cleaned_json) + inferred = llm_output.get("inferred_relationships", []) + anomalies = llm_output.get("anomalies", []) + if not isinstance(inferred, list) or not isinstance(anomalies, list): + raise ValueError("LLM response is not in the expected list format for keys.") + return {"inferred_relationships": inferred, "anomalies": anomalies} + except json.JSONDecodeError as e: + logger.error(f"Error decoding LLM JSON response: {e}. 
Cleaned Response: {cleaned_json}") + return {"inferred_relationships": [], "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}]} + except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: + logger.error(f"Error calling LLM or processing response: {e}") + return {"inferred_relationships": [], "anomalies": [{"error": f"LLM analysis failed: {e}"}]} + except Exception as e: + logger.error(f"Unexpected error during LLM analysis: {e}") + return {"inferred_relationships": [], "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}]} + def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: - details = {"tables": {}, "views": {}, "foreign_keys": []} + details = {"tables": {}, "views": {}, "foreign_keys": [], "inferred_relationships": [], "anomalies": []} logger.info(f"Fetching PostgreSQL schema details for: {schema_name}") - # Tables and Columns - tables_query = f""" - SELECT table_name - FROM information_schema.tables - WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE'; - """ + tables_query = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE';" tables = _execute_query(conn, tables_query) - for table in tables: t_name = table['table_name'] details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} - cols_query = f""" SELECT column_name, data_type, character_maximum_length, numeric_precision, numeric_scale, is_nullable, column_default - FROM information_schema.columns - WHERE table_schema = '{schema_name}' AND table_name = '{t_name}'; + FROM information_schema.columns WHERE table_schema = '{schema_name}' AND table_name = '{t_name}'; """ - columns = _execute_query(conn, cols_query) - for col in columns: + for col in _execute_query(conn, cols_query): details["tables"][t_name]["columns"][col['column_name']] = { - "type": col['data_type'], - "length": col['character_maximum_length'], - "precision": col['numeric_precision'], - "scale": col['numeric_scale'], - "nullable": col['is_nullable'] == 'YES', - "default": col['column_default'], + "type": col['data_type'], "length": col['character_maximum_length'], "precision": col['numeric_precision'], + "scale": col['numeric_scale'], "nullable": col['is_nullable'] == 'YES', "default": col['column_default'], } + constraints_query = f""" + SELECT tc.table_name, tc.constraint_name, tc.constraint_type, kcu.column_name, cc.check_clause + FROM information_schema.table_constraints tc + LEFT JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema AND tc.table_name = kcu.table_name + LEFT JOIN information_schema.check_constraints cc ON tc.constraint_name = cc.constraint_name AND tc.table_schema = cc.constraint_schema + WHERE tc.table_schema = '{schema_name}' AND tc.table_name = '{t_name}'; + """ + details["tables"][t_name]["constraints"] = _execute_query(conn, constraints_query) + indexes_query = f""" + SELECT t.relname AS table_name, i.relname AS index_name, a.attname AS column_name, ix.indisunique AS is_unique + FROM pg_class t JOIN pg_index ix ON t.oid = ix.indrelid JOIN pg_class i ON i.oid = ix.indexrelid + LEFT JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = ANY(ix.indkey) + JOIN pg_namespace n ON t.relnamespace = n.oid WHERE n.nspname = '{schema_name}' AND t.relname = '{t_name}' AND t.relkind = 'r'; + """ + try: + indexes = _execute_query(conn, indexes_query) + grouped_indexes = {} + for index in indexes: + if 
index['column_name']: + idx_name = index['index_name'] + if idx_name not in grouped_indexes: grouped_indexes[idx_name] = {"name": idx_name, "columns": [], "unique": index['is_unique']} + if index['column_name'] not in grouped_indexes[idx_name]["columns"]: grouped_indexes[idx_name]["columns"].append(index['column_name']) + details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) + except Exception as e: logger.error(f"Error fetching PostgreSQL indexes for {t_name}: {e}") - # Constraints (PK, UNIQUE, CHECK) - constraints_query = f""" - SELECT - tc.table_name, - tc.constraint_name, - tc.constraint_type, - kcu.column_name, - cc.check_clause - FROM information_schema.table_constraints tc - LEFT JOIN information_schema.key_column_usage kcu - ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema AND tc.table_name = kcu.table_name - LEFT JOIN information_schema.check_constraints cc - ON tc.constraint_name = cc.constraint_name AND tc.table_schema = cc.constraint_schema - WHERE tc.table_schema = '{schema_name}' AND tc.table_name IN (SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE'); - """ - constraints = _execute_query(conn, constraints_query) - for const in constraints: - t_name = const['table_name'] - if t_name in details["tables"]: - details["tables"][t_name]["constraints"].append({ - "name": const['constraint_name'], - "type": const['constraint_type'], - "columns": const['column_name'], - "check_clause": const['check_clause'], - }) - - # Indexes - indexes_query = f""" - SELECT - t.relname AS table_name, - i.relname AS index_name, - a.attname AS column_name, - ix.indisunique AS is_unique - FROM pg_class t - JOIN pg_index ix ON t.oid = ix.indrelid - JOIN pg_class i ON i.oid = ix.indexrelid - LEFT JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = ANY(ix.indkey) - JOIN pg_namespace n ON t.relnamespace = n.oid - WHERE n.nspname = '{schema_name}' AND t.relkind = 'r'; - """ - try: - indexes = _execute_query(conn, indexes_query) - for index in indexes: - t_name = index['table_name'] - if t_name in details["tables"] and index['column_name']: - idx_name = index['index_name'] - found = False - for existing_idx in details["tables"][t_name]["indexes"]: - if existing_idx["name"] == idx_name: - if index['column_name'] not in existing_idx["columns"]: - existing_idx["columns"].append(index['column_name']) - found = True - break - if not found: - details["tables"][t_name]["indexes"].append({ - "name": idx_name, - "columns": [index['column_name']], - "unique": index['is_unique'] - }) - except Exception as e: - logger.error(f"Error fetching PostgreSQL indexes: {e}") - - # Foreign Keys fks_query = f""" - SELECT - tc.table_name AS from_table, - kcu.column_name AS from_column, - ccu.table_name AS to_table, - ccu.column_name AS to_column, - tc.constraint_name - FROM information_schema.table_constraints AS tc - JOIN information_schema.key_column_usage AS kcu + SELECT tc.constraint_name, tc.table_name AS from_table, kcu.column_name AS from_column, + ccu.table_schema AS to_schema, ccu.table_name AS to_table, ccu.column_name AS to_column + FROM information_schema.table_constraints AS tc JOIN information_schema.key_column_usage AS kcu ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema JOIN information_schema.constraint_column_usage AS ccu ON ccu.constraint_name = tc.constraint_name AND ccu.table_schema = tc.table_schema WHERE tc.constraint_type = 'FOREIGN KEY' AND 
tc.table_schema = '{schema_name}'; """ details["foreign_keys"] = _execute_query(conn, fks_query) + views_query = f"SELECT table_name AS view_name, view_definition FROM information_schema.views WHERE table_schema = '{schema_name}';" + for view in _execute_query(conn, views_query): details["views"][view['view_name']] = {"definition": view['view_definition']} - # Views - views_query = f""" - SELECT table_name AS view_name, view_definition - FROM information_schema.views - WHERE table_schema = '{schema_name}'; - """ - views = _execute_query(conn, views_query) - for view in views: - details["views"][view['view_name']] = {"definition": view['view_definition']} - + llm_analysis = _analyze_with_llm(schema_name, "PostgreSQL", details) + details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) + details["anomalies"] = llm_analysis.get("anomalies", []) + logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships for PostgreSQL.") + logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies for PostgreSQL.") return details From 7f8decf2d075a0735eadd745e6e0cda686c83f35 Mon Sep 17 00:00:00 2001 From: SanuGhosh Date: Wed, 19 Nov 2025 13:50:43 +0530 Subject: [PATCH 4/8] feat(data-model-discovery-agent): refactor code --- .../agent.excalidraw | 1020 ----------------- .../data_model_discovery_agent/agent.py | 2 +- .../utils/mssql_profiling_utils.py | 42 +- .../utils/postgres_profiling_utils.py | 46 +- .../sub_agents/database_cred_agent/tools.py | 3 +- .../database_introspection_agent/__init__.py | 1 - .../database_introspection_agent/agent.py | 43 - .../sub_agents/postgres_sql_agent/__init__.py | 1 - .../sub_agents/postgres_sql_agent/agent.py | 20 - .../database_introspection_agent/tools.py | 66 -- .../utils/mssql_utils.py | 134 ++- .../utils/mysql_utils.py | 13 +- .../utils/postgresql_utils.py | 8 +- 13 files changed, 139 insertions(+), 1260 deletions(-) delete mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/agent.excalidraw delete mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/__init__.py delete mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/agent.py delete mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/__init__.py delete mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/agent.py delete mode 100644 agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/tools.py diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.excalidraw b/agent-app/app/sub_agents/data_model_discovery_agent/agent.excalidraw deleted file mode 100644 index 03436f1..0000000 --- a/agent-app/app/sub_agents/data_model_discovery_agent/agent.excalidraw +++ /dev/null @@ -1,1020 +0,0 @@ -{ - "type": "excalidraw", - "version": 2, - "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", - "elements": [ - { - "id": "O-PaQXH396tStCF7JKxfI", - "type": "rectangle", - "x": 310.15625, - "y": 221.078125, - "width": 110.26171875, - "height": 88.63671875, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#a5d8ff", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a0", - 
"roundness": { - "type": 3 - }, - "seed": 1469789404, - "version": 74, - "versionNonce": 876272100, - "isDeleted": false, - "boundElements": [ - { - "type": "text", - "id": "-GNllmBjF8GoBkCfEE2cx" - }, - { - "id": "PiKGRVvnyp7pjLD1tCi5u", - "type": "arrow" - }, - { - "id": "JU5XimKzjf75Jwi0acyvd", - "type": "arrow" - } - ], - "updated": 1761936287452, - "link": null, - "locked": false - }, - { - "id": "-GNllmBjF8GoBkCfEE2cx", - "type": "text", - "x": 320.28714752197266, - "y": 240.396484375, - "width": 89.99992370605469, - "height": 50, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a0V", - "roundness": null, - "seed": 351098332, - "version": 17, - "versionNonce": 1692608092, - "isDeleted": false, - "boundElements": [], - "updated": 1761936287452, - "link": null, - "locked": false, - "text": "db\ndiscovery", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "O-PaQXH396tStCF7JKxfI", - "originalText": "db discovery", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "IrO9UaKnLCpnbzqRSCJzy", - "type": "rectangle", - "x": 563.1640625, - "y": 126.328125, - "width": 120.13671875, - "height": 79.953125, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a1", - "roundness": { - "type": 3 - }, - "seed": 110764636, - "version": 102, - "versionNonce": 1325511004, - "isDeleted": false, - "boundElements": [ - { - "id": "PiKGRVvnyp7pjLD1tCi5u", - "type": "arrow" - }, - { - "type": "text", - "id": "DYEQeZoLdTVJOmIEEHann" - } - ], - "updated": 1761936320962, - "link": null, - "locked": false - }, - { - "id": "DYEQeZoLdTVJOmIEEHann", - "type": "text", - "x": 571.3824615478516, - "y": 153.8046875, - "width": 103.69992065429688, - "height": 25, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a1V", - "roundness": null, - "seed": 404114148, - "version": 12, - "versionNonce": 1590007012, - "isDeleted": false, - "boundElements": [], - "updated": 1761936163253, - "link": null, - "locked": false, - "text": "cred agent", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "IrO9UaKnLCpnbzqRSCJzy", - "originalText": "cred agent", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "pwSkpY2oOFIyCCIItNnKl", - "type": "rectangle", - "x": 671.876953125, - "y": 167.31640625, - "width": 120.13671875, - "height": 79.953125, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffec99", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a2", - "roundness": { - "type": 3 - }, - "seed": 356916452, - "version": 324, - "versionNonce": 901894620, - "isDeleted": false, - "boundElements": [ - { - "type": "text", - "id": "nER7F15wfOHOH0UQQSG_n" - } - ], - "updated": 1761936325675, - "link": null, - "locked": false - }, - { - "id": "nER7F15wfOHOH0UQQSG_n", - "type": "text", - "x": 685.5053482055664, - "y": 182.29296875, - "width": 92.87992858886719, - 
"height": 50, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a2V", - "roundness": null, - "seed": 1125348444, - "version": 160, - "versionNonce": 2044933724, - "isDeleted": false, - "boundElements": [], - "updated": 1761936325675, - "link": null, - "locked": false, - "text": "validation\nagent", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "pwSkpY2oOFIyCCIItNnKl", - "originalText": "validation\nagent", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "n4vdN0n3RQnDSPjq-nr99", - "type": "rectangle", - "x": 570.306640625, - "y": 349.5625, - "width": 120.13671875, - "height": 79.953125, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#b2f2bb", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a3", - "roundness": { - "type": 3 - }, - "seed": 1738953820, - "version": 430, - "versionNonce": 22667364, - "isDeleted": false, - "boundElements": [ - { - "id": "JU5XimKzjf75Jwi0acyvd", - "type": "arrow" - }, - { - "id": "siBHscB5llCUfh4O0-BYR", - "type": "arrow" - }, - { - "id": "lQg3kQJ64cAzOQlsm_nnV", - "type": "arrow" - }, - { - "type": "text", - "id": "XUCvYXSXY4SnFXxP9sw4E" - } - ], - "updated": 1761936254839, - "link": null, - "locked": false - }, - { - "id": "XUCvYXSXY4SnFXxP9sw4E", - "type": "text", - "x": 593.5650329589844, - "y": 364.5390625, - "width": 73.61993408203125, - "height": 50, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a3V", - "roundness": null, - "seed": 62680932, - "version": 30, - "versionNonce": 1750968420, - "isDeleted": false, - "boundElements": [], - "updated": 1761936192476, - "link": null, - "locked": false, - "text": "db spec\nagent", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "n4vdN0n3RQnDSPjq-nr99", - "originalText": "db spec\nagent", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "4KwTb3Ksy9iKmaG6rp9rf", - "type": "rectangle", - "x": 774.623046875, - "y": 285.046875, - "width": 120.13671875, - "height": 79.953125, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#b2f2bb", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a4", - "roundness": { - "type": 3 - }, - "seed": 705126116, - "version": 180, - "versionNonce": 1194355804, - "isDeleted": false, - "boundElements": [ - { - "type": "text", - "id": "L7IdWjNFhHkUSiIxsHLDr" - }, - { - "id": "siBHscB5llCUfh4O0-BYR", - "type": "arrow" - }, - { - "id": "il6TSwE83Z9g89nVNsKKQ", - "type": "arrow" - } - ], - "updated": 1761936257198, - "link": null, - "locked": false - }, - { - "id": "L7IdWjNFhHkUSiIxsHLDr", - "type": "text", - "x": 792.091438293457, - "y": 312.5234375, - "width": 85.19993591308594, - "height": 25, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a4V", - "roundness": null, - 
"seed": 1748872412, - "version": 20, - "versionNonce": 1293173988, - "isDeleted": false, - "boundElements": [], - "updated": 1761935917615, - "link": null, - "locked": false, - "text": "postgres", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "4KwTb3Ksy9iKmaG6rp9rf", - "originalText": "postgres", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "bkFb3Nq2lFu--GSYS7px9", - "type": "rectangle", - "x": 781.837890625, - "y": 409.16796875, - "width": 120.13671875, - "height": 79.953125, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#b2f2bb", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a5", - "roundness": { - "type": 3 - }, - "seed": 595614044, - "version": 147, - "versionNonce": 1679980380, - "isDeleted": false, - "boundElements": [ - { - "type": "text", - "id": "fu0fiBBjZBALJDnQu7-Lv" - }, - { - "id": "lQg3kQJ64cAzOQlsm_nnV", - "type": "arrow" - }, - { - "id": "qgF1zyHojO8Oq_8ebpZ4V", - "type": "arrow" - } - ], - "updated": 1761936259248, - "link": null, - "locked": false - }, - { - "id": "fu0fiBBjZBALJDnQu7-Lv", - "type": "text", - "x": 816.9062652587891, - "y": 436.64453125, - "width": 49.999969482421875, - "height": 25, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a6", - "roundness": null, - "seed": 266223460, - "version": 8, - "versionNonce": 352620132, - "isDeleted": false, - "boundElements": [], - "updated": 1761936258198, - "link": null, - "locked": false, - "text": "mysql", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "bkFb3Nq2lFu--GSYS7px9", - "originalText": "mysql", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "aku4cEBO3VLvxLajPKOX6", - "type": "rectangle", - "x": 1091.208984375, - "y": 306.5546875, - "width": 120.13671875, - "height": 79.953125, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a7", - "roundness": { - "type": 3 - }, - "seed": 1449833948, - "version": 648, - "versionNonce": 444918372, - "isDeleted": false, - "boundElements": [], - "updated": 1761936265152, - "link": null, - "locked": false - }, - { - "id": "BWyzPhPtf4XfBjxGoK1SC", - "type": "rectangle", - "x": 1098.244140625, - "y": 407.28125, - "width": 120.13671875, - "height": 79.953125, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "a8", - "roundness": { - "type": 3 - }, - "seed": 913800420, - "version": 495, - "versionNonce": 1727028060, - "isDeleted": false, - "boundElements": [], - "updated": 1761936269823, - "link": null, - "locked": false - }, - { - "id": "ye_64zqr7CDhyAiWzvOJm", - "type": "rectangle", - "x": 1050.873046875, - "y": 232.08984375, - "width": 201.48046875000006, - "height": 273.68359375, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": 
[], - "frameId": null, - "index": "a9", - "roundness": { - "type": 3 - }, - "seed": 356938076, - "version": 696, - "versionNonce": 1666667868, - "isDeleted": false, - "boundElements": [ - { - "id": "il6TSwE83Z9g89nVNsKKQ", - "type": "arrow" - }, - { - "id": "qgF1zyHojO8Oq_8ebpZ4V", - "type": "arrow" - } - ], - "updated": 1761936221713, - "link": null, - "locked": false - }, - { - "id": "ZAleF85iz7usCWNXn2laL", - "type": "text", - "x": 1123.671875, - "y": 255.84375, - "width": 42.25996398925781, - "height": 25, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aA", - "roundness": null, - "seed": 1498817892, - "version": 132, - "versionNonce": 551298012, - "isDeleted": false, - "boundElements": [], - "updated": 1761936079075, - "link": null, - "locked": false, - "text": "utils", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "utils", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "5S9Z3gdR-hzsrzK_vw80A", - "type": "text", - "x": 1108.55859375, - "y": 321.95703125, - "width": 95.15994262695312, - "height": 50, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aB", - "roundness": null, - "seed": 336976356, - "version": 117, - "versionNonce": 2027863268, - "isDeleted": false, - "boundElements": [], - "updated": 1761936040603, - "link": null, - "locked": false, - "text": "db result \nto MD", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "db result \nto MD", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "fToOun2xdKMOb2aVkf29p", - "type": "text", - "x": 1120.33203125, - "y": 422.76953125, - "width": 76.0599365234375, - "height": 50, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aE", - "roundness": null, - "seed": 585740644, - "version": 85, - "versionNonce": 1959805532, - "isDeleted": false, - "boundElements": [], - "updated": 1761936266468, - "link": null, - "locked": false, - "text": "MD to \nMermaid", - "fontSize": 20, - "fontFamily": 5, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "MD to \nMermaid", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "PiKGRVvnyp7pjLD1tCi5u", - "type": "arrow", - "x": 420.1875, - "y": 224.28125, - "width": 142.8125, - "height": 58.0546875, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aF", - "roundness": { - "type": 2 - }, - "seed": 54532956, - "version": 282, - "versionNonce": 1459463140, - "isDeleted": false, - "boundElements": [], - "updated": 1761936119294, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 142.8125, - -58.0546875 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "O-PaQXH396tStCF7JKxfI", - "focus": -0.2816993651409218, - "gap": 
5.510913037302248 - }, - "endBinding": { - "elementId": "IrO9UaKnLCpnbzqRSCJzy", - "focus": 0.38144607016235266, - "gap": 1 - }, - "startArrowhead": null, - "endArrowhead": "arrow", - "elbowed": false - }, - { - "id": "JU5XimKzjf75Jwi0acyvd", - "type": "arrow", - "x": 423.2396548803663, - "y": 302.0355739110941, - "width": 146.20703125, - "height": 88.73046875, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aG", - "roundness": { - "type": 2 - }, - "seed": 717361372, - "version": 418, - "versionNonce": 808204388, - "isDeleted": false, - "boundElements": [], - "updated": 1761936175410, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 146.20703125, - 88.73046875 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "O-PaQXH396tStCF7JKxfI", - "focus": 0.018883793865720252, - "gap": 5.699811232612768 - }, - "endBinding": { - "elementId": "n4vdN0n3RQnDSPjq-nr99", - "focus": -0.4998404324681728, - "gap": 1 - }, - "startArrowhead": null, - "endArrowhead": "arrow", - "elbowed": false - }, - { - "id": "siBHscB5llCUfh4O0-BYR", - "type": "arrow", - "x": 694.1420490425966, - "y": 383.9550382997701, - "width": 80.4140625, - "height": 59.02734375, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aI", - "roundness": { - "type": 2 - }, - "seed": 665494364, - "version": 609, - "versionNonce": 449713244, - "isDeleted": false, - "boundElements": [], - "updated": 1761936148133, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 80.4140625, - -59.02734375 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "n4vdN0n3RQnDSPjq-nr99", - "focus": 0.4903538633420851, - "gap": 3.698689667596568 - }, - "endBinding": { - "elementId": "4KwTb3Ksy9iKmaG6rp9rf", - "focus": 0.526204109830513, - "gap": 1 - }, - "startArrowhead": null, - "endArrowhead": "arrow", - "elbowed": false - }, - { - "id": "lQg3kQJ64cAzOQlsm_nnV", - "type": "arrow", - "x": 692.4222629558907, - "y": 405.1802276495595, - "width": 87.06203184912158, - "height": 54.37911917950322, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aJ", - "roundness": { - "type": 2 - }, - "seed": 1585799908, - "version": 782, - "versionNonce": 564994532, - "isDeleted": false, - "boundElements": [], - "updated": 1761936258199, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 87.06203184912158, - 54.37911917950322 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "n4vdN0n3RQnDSPjq-nr99", - "focus": -0.297658746552027, - "gap": 2.391218497804516 - }, - "endBinding": { - "elementId": "bkFb3Nq2lFu--GSYS7px9", - "focus": -0.6375065786945301, - "gap": 2.847062752195484 - }, - "startArrowhead": null, - "endArrowhead": "arrow", - "elbowed": false - }, - { - "id": "il6TSwE83Z9g89nVNsKKQ", - "type": "arrow", - "x": 896.953125, - "y": 328.69140625, - "width": 152.453125, - "height": 0.7734375, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 
2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aK", - "roundness": { - "type": 2 - }, - "seed": 1539913948, - "version": 118, - "versionNonce": 1226485596, - "isDeleted": false, - "boundElements": [], - "updated": 1761936230052, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 152.453125, - -0.7734375 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "4KwTb3Ksy9iKmaG6rp9rf", - "focus": 0.09890046545733103, - "gap": 2.193359375 - }, - "endBinding": { - "elementId": "ye_64zqr7CDhyAiWzvOJm", - "focus": 0.302375871013406, - "gap": 1.466796875 - }, - "startArrowhead": null, - "endArrowhead": "arrow", - "elbowed": false - }, - { - "id": "qgF1zyHojO8Oq_8ebpZ4V", - "type": "arrow", - "x": 905.0233071976285, - "y": 447.6495769914197, - "width": 144.8465371811934, - "height": 0.32010884601004364, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aL", - "roundness": { - "type": 2 - }, - "seed": 1109247588, - "version": 223, - "versionNonce": 1123501412, - "isDeleted": false, - "boundElements": [], - "updated": 1761936258199, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 144.8465371811934, - -0.32010884601004364 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "bkFb3Nq2lFu--GSYS7px9", - "focus": -0.03381093133160738, - "gap": 3.695205648243359 - }, - "endBinding": { - "elementId": "ye_64zqr7CDhyAiWzvOJm", - "focus": -0.5709895611084019, - "gap": 1.6873931482432454 - }, - "startArrowhead": null, - "endArrowhead": "arrow", - "elbowed": false - } - ], - "appState": { - "gridSize": 20, - "gridStep": 5, - "gridModeEnabled": false, - "viewBackgroundColor": "#ffffff" - }, - "files": {} -} \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py index c11b977..46681e7 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py @@ -177,7 +177,7 @@ def root_agent_instruction(ctx: ReadonlyContext) -> str: - Recall `schema_introspection_agent` and pass the schema name '{selected_schema}' as the input to it to ensure the structure is loaded. - Example AgentTool Call: `schema_introspection_agent("{selected_schema}")` """ - else: # Should ideally not be reached if states are managed well + else: return base_instruction + """ **Current Task:** Determine the next step based on the conversation history and session state. If unsure, ask the user for clarification. 
""" diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py index 428c949..80a3483 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py @@ -1,16 +1,19 @@ import logging from typing import Dict, Any, List +import pyodbc +from decimal import Decimal logger = logging.getLogger(__name__) def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts for SQL Server.""" cursor = conn.cursor() try: cursor.execute(query) if cursor.description: columns = [column[0] for column in cursor.description] rows = cursor.fetchall() - return [dict(zip(columns, [val for val in row])) for row in rows] + return [dict(zip(columns, row)) for row in rows] return [] finally: cursor.close() @@ -25,7 +28,7 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, profile_results["cardinality"][table_name] = {} full_table_name = f"[{schema_name}].[{table_name}]" - # Nullability + # Nullability (AC 4.1) for col_name in table_info.get("columns", {}): null_q = f""" SELECT @@ -35,13 +38,15 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, null_q)[0] - null_pct = (res['null_count'] / res['total_count']) * 100 if res['total_count'] > 0 else 0 + total_count = int(res['total_count']) + null_count = int(res['null_count']) + null_pct = (null_count / total_count) * 100 if total_count > 0 else 0 profile_results["nullability"][table_name][col_name] = round(null_pct, 2) except Exception as e: - logger.error(f"Error profiling nulls for {table_name}.{col_name}: {e}") + logger.error(f"Error profiling nulls for {full_table_name}.[{col_name}]: {e}") profile_results["nullability"][table_name][col_name] = "Error" - # Cardinality - PKs, FKs + # Cardinality (AC 4.2) - PKs, FKs key_columns = set() for const in table_info.get("constraints", []): if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): @@ -55,20 +60,21 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, card_q = f"SELECT COUNT(DISTINCT [{col_name}]) as unique_count FROM {full_table_name};" try: res = _execute_query(conn, card_q)[0] - profile_results["cardinality"][table_name][col_name] = res['unique_count'] + profile_results["cardinality"][table_name][col_name] = int(res['unique_count']) except Exception as e: - logger.error(f"Error profiling cardinality for {table_name}.{col_name}: {e}") + logger.error(f"Error profiling cardinality for {full_table_name}.[{col_name}]: {e}") profile_results["cardinality"][table_name][col_name] = "Error" - # Orphan Records + # Orphan Records (AC 4.3) for fk in schema_structure.get("foreign_keys", []): from_table, from_col = fk.get("from_table"), fk.get("from_column") to_table, to_col = fk.get("to_table"), fk.get("to_column") + to_schema = fk.get("to_schema", schema_name) if from_table and from_col and to_table and to_col: fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" logger.info(f"Checking orphans for {fk_name}") from_full = f"[{schema_name}].[{from_table}]" - to_full = f"[{schema_name}].[{to_table}]" + to_full = f"[{to_schema}].[{to_table}]" 
orphan_q = f""" SELECT COUNT_BIG(s.[{from_col}]) as total_fk_values, @@ -78,19 +84,22 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, orphan_q)[0] - orphan_pct = (res['orphan_count'] / res['total_fk_values']) * 100 if res['total_fk_values'] > 0 else 0 + total_fk_values = int(res['total_fk_values']) + orphan_count = int(res['orphan_count']) + orphan_pct = (orphan_count / total_fk_values) * 100 if total_fk_values > 0 else 0 profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) except Exception as e: logger.error(f"Error checking orphans for {fk_name}: {e}") profile_results["orphan_records"][fk_name] = "Error" - # Type Anomalies - Heuristic for phone/zip + # Type Anomalies (AC 4.4) - Heuristic for phone/zip for table_name, table_info in tables.items(): full_table_name = f"[{schema_name}].[{table_name}]" for col_name, col_info in table_info.get("columns", {}).items(): col_type = col_info.get("type", "").lower() - if "char" in col_type or "text" in col_type: + if "char" in col_type or "text" in col_type or "varchar" in col_type: if "phone" in col_name.lower() or "zip" in col_name.lower() or "postal" in col_name.lower(): + # Regex for anything not a digit, hyphen, or period anomaly_q = f""" SELECT COUNT_BIG(*) as non_numeric_count FROM (SELECT TOP {sample_size} [{col_name}] FROM {full_table_name} WHERE [{col_name}] IS NOT NULL) as s @@ -98,11 +107,12 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, anomaly_q)[0] - if res['non_numeric_count'] > 0: + non_numeric_count = int(res['non_numeric_count']) + if non_numeric_count > 0: key = f"{table_name}.{col_name}" if key not in profile_results["type_anomalies"]: profile_results["type_anomalies"][key] = [] - profile_results["type_anomalies"][key].append(f"Found {res['non_numeric_count']} rows with non-numeric characters in sample.") + profile_results["type_anomalies"][key].append(f"Found {non_numeric_count} rows with non-numeric characters in sample.") except Exception as e: - logger.warning(f"Error checking type anomaly for {table_name}.{col_name}: {e}") - return profile_results \ No newline at end of file + logger.warning(f"Error checking type anomaly for {full_table_name}.[{col_name}]: {e}") + return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py index b6a0756..03904ec 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py @@ -1,13 +1,15 @@ import logging from typing import Dict, Any, List +import psycopg2 +from decimal import Decimal logger = logging.getLogger(__name__) def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: - """Executes a SQL query and returns results as a list of dicts.""" + """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" cursor = conn.cursor() try: - conn.autocommit = True # Ensure no lingering transactions + conn.autocommit = True cursor.execute(query) if cursor.description: columns = [desc[0] for desc in cursor.description] @@ -25,20 +27,24 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: 
Dict[st logger.info(f"Profiling table: {schema_name}.{table_name}") profile_results["nullability"][table_name] = {} profile_results["cardinality"][table_name] = {} + full_table_name = f'"{schema_name}"."{table_name}"' + # Nullability (AC 4.1) for col_name in table_info.get("columns", {}): null_q = f""" SELECT COUNT(*) as total_count, COUNT(*) - COUNT("{col_name}") as null_count - FROM (SELECT "{col_name}" FROM "{schema_name}"."{table_name}" LIMIT {sample_size}) as sampled; + FROM (SELECT "{col_name}" FROM {full_table_name} LIMIT {sample_size}) as sampled; """ try: res = _execute_query(conn, null_q)[0] - null_pct = (res['null_count'] / res['total_count']) * 100 if res['total_count'] > 0 else 0 + total_count = int(res['total_count']) + null_count = int(res['null_count']) + null_pct = (null_count / total_count) * 100 if total_count > 0 else 0 profile_results["nullability"][table_name][col_name] = round(null_pct, 2) except Exception as e: - logger.error(f"Error profiling nulls for {table_name}.{col_name}: {e}") + logger.error(f"Error profiling nulls for {full_table_name}.\"{col_name}\": {e}") profile_results["nullability"][table_name][col_name] = "Error" # Cardinality (AC 4.2) - PKs, FKs @@ -52,31 +58,36 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st for col_name in key_columns: if col_name in table_info.get("columns", {}): - card_q = f'SELECT COUNT(DISTINCT "{col_name}") as unique_count FROM "{schema_name}"."{table_name}";' + card_q = f'SELECT COUNT(DISTINCT "{col_name}") as unique_count FROM {full_table_name};' try: res = _execute_query(conn, card_q)[0] - profile_results["cardinality"][table_name][col_name] = res['unique_count'] + profile_results["cardinality"][table_name][col_name] = int(res['unique_count']) except Exception as e: - logger.error(f"Error profiling cardinality for {table_name}.{col_name}: {e}") + logger.error(f"Error profiling cardinality for {full_table_name}.\"{col_name}\": {e}") profile_results["cardinality"][table_name][col_name] = "Error" # Orphan Records (AC 4.3) for fk in schema_structure.get("foreign_keys", []): from_table, from_col = fk.get("from_table"), fk.get("from_column") to_table, to_col = fk.get("to_table"), fk.get("to_column") + to_schema = fk.get("to_schema", schema_name) # Assume same schema if not specified if from_table and from_col and to_table and to_col: fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" logger.info(f"Checking orphans for {fk_name}") + from_full = f'"{schema_name}"."{from_table}"' + to_full = f'"{to_schema}"."{to_table}"' orphan_q = f""" SELECT COUNT(s."{from_col}") as total_fk_values, SUM(CASE WHEN t."{to_col}" IS NULL THEN 1 ELSE 0 END) as orphan_count - FROM (SELECT "{from_col}" FROM "{schema_name}"."{from_table}" WHERE "{from_col}" IS NOT NULL LIMIT {sample_size}) as s - LEFT JOIN "{schema_name}"."{to_table}" t ON s."{from_col}" = t."{to_col}"; + FROM (SELECT "{from_col}" FROM {from_full} WHERE "{from_col}" IS NOT NULL LIMIT {sample_size}) as s + LEFT JOIN {to_full} t ON s."{from_col}" = t."{to_col}"; """ try: res = _execute_query(conn, orphan_q)[0] - orphan_pct = (res['orphan_count'] / res['total_fk_values']) * 100 if res['total_fk_values'] > 0 else 0 + total_fk_values = int(res['total_fk_values']) + orphan_count = int(res['orphan_count']) + orphan_pct = (orphan_count / total_fk_values) * 100 if total_fk_values > 0 else 0 profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) except Exception as e: logger.error(f"Error checking orphans for {fk_name}: {e}") @@ -84,23 +95,26 
@@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st # Type Anomalies (AC 4.4) - Heuristic for phone/zip for table_name, table_info in tables.items(): + full_table_name = f'"{schema_name}"."{table_name}"' for col_name, col_info in table_info.get("columns", {}).items(): col_type = col_info.get("type", "").lower() if "char" in col_type or "text" in col_type: if "phone" in col_name.lower() or "zip" in col_name.lower() or "postal" in col_name.lower(): + # Regex for anything not a digit, hyphen, or period anomaly_q = f""" SELECT COUNT(*) as non_numeric_count - FROM (SELECT "{col_name}" FROM "{schema_name}"."{table_name}" WHERE "{col_name}" IS NOT NULL LIMIT {sample_size}) as s + FROM (SELECT "{col_name}" FROM {full_table_name} WHERE "{col_name}" IS NOT NULL LIMIT {sample_size}) as s WHERE "{col_name}" ~ '[^0-9.-]'; """ try: res = _execute_query(conn, anomaly_q)[0] - if res['non_numeric_count'] > 0: + non_numeric_count = int(res['non_numeric_count']) + if non_numeric_count > 0: key = f"{table_name}.{col_name}" if key not in profile_results["type_anomalies"]: profile_results["type_anomalies"][key] = [] - profile_results["type_anomalies"][key].append(f"Found {res['non_numeric_count']} rows with non-numeric characters in sample.") + profile_results["type_anomalies"][key].append(f"Found {non_numeric_count} rows with non-numeric characters in sample.") except Exception as e: - logger.warning(f"Error checking type anomaly for {table_name}.{col_name}: {e}") + logger.warning(f"Error checking type anomaly for {full_table_name}.\"{col_name}\": {e}") - return profile_results \ No newline at end of file + return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py index 19d2a60..6197ba4 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py @@ -2,7 +2,6 @@ import logging from typing import Dict, Any, List -# Import database connectors import psycopg2 import mysql.connector import pyodbc @@ -23,7 +22,7 @@ def _get_schemas(conn: Any, db_type: str) -> List[str]: schemas = [row[0] for row in cursor.fetchall()] elif db_type == "mysql": cursor.execute("SHOW DATABASES;") - # Filter out default mysql databases + # Filtering out default mysql databases default_dbs = {'information_schema', 'mysql', 'performance_schema', 'sys'} schemas = [row[0] for row in cursor.fetchall() if row[0] not in default_dbs] elif db_type == "mssql": diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/__init__.py deleted file mode 100644 index 02c597e..0000000 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/agent.py deleted file mode 100644 index 35e4061..0000000 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/agent.py +++ /dev/null @@ -1,43 +0,0 @@ -from google.adk.agents import LlmAgent -from google.adk.tools.agent_tool import AgentTool -from .tools import execute_sql_query -from .sub_agents.postgres_sql_agent.agent import postgres_sql_agent - -database_introspection_agent = LlmAgent( - name="database_introspection_agent", - model='gemini-2.5-flash', - description="Handles database interactions, including generating and executing SQL queries.", - instruction=""" - You are a Database Interaction Agent. Your tasks involve understanding user requests related to database operations, generating the appropriate SQL query using a specialized sub-agent, and executing the query. - - 1. **Understand Request:** Determine what the user wants to do with the database (e.g., select data, count rows, etc.). - - 2. **Check Connection:** Verify that a database connection is active by checking the session state. (You don't need a tool for this, just know it's a prerequisite). - - 3. **Generate SQL:** Use the appropriate sub-agent to generate the SQL query. Currently, only PostgreSQL is supported via `postgres_sql_agent`. - - Invoke `postgres_sql_agent` with the user's natural language request. - - 4. **Execute SQL:** Take the SQL query output from the sub-agent and use the `execute_sql_query` tool to run it against the database. - - 5. **Present Results:** Relay the results or status from the `execute_sql_query` tool back to the user in a clear and understandable way. - - If the result contains data, format it nicely. - - If it's an error, explain the error. - - **Example Flow:** - User: "How many customers do we have?" - You: (Recognize this needs a SQL query) - You: (Call `postgres_sql_agent` with "How many customers do we have?") - `postgres_sql_agent`: "SELECT COUNT(*) FROM customers;" - You: (Take the SQL string) - You: (Call `execute_sql_query` with sql_query="SELECT COUNT(*) FROM customers;") - `execute_sql_query`: Returns success with the count. - You: "There are [count] customers." - - **Constraint:** Only use the `postgres_sql_agent` for generating SQL. - **Note:** The `postgres_sql_agent` is specifically for PostgreSQL databases. - """, - tools=[ - AgentTool(agent=postgres_sql_agent), - execute_sql_query - ], -) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/__init__.py deleted file mode 100644 index 02c597e..0000000 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/agent.py deleted file mode 100644 index 32a6655..0000000 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/sub_agents/postgres_sql_agent/agent.py +++ /dev/null @@ -1,20 +0,0 @@ -from google.adk.agents import LlmAgent - -postgres_sql_agent = LlmAgent( - name="postgres_sql_agent", - model='gemini-2.5-flash', - description="A specialized agent that generates PostgreSQL SQL queries based on natural language requests.", - instruction=""" - You are a PostgreSQL expert. Your task is to generate a single, executable PostgreSQL SQL query based on the user's request. - - Only output the SQL query. - - Do not include any explanations, backticks, or "SQL" markers, just the raw query. - - If the request is ambiguous, ask for clarification, but strive to generate a query if possible. - - Assume standard SQL and PostgreSQL syntax. - - Example Request: "Show me all users from the users table" - Example Output: SELECT * FROM users; - - Example Request: "Find the average age of employees" - Example Output: SELECT AVG(age) FROM employees; - """, -) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/tools.py deleted file mode 100644 index 04d71a5..0000000 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_introspection_agent/tools.py +++ /dev/null @@ -1,66 +0,0 @@ -import logging -import json -import psycopg2 -from typing import Dict, Any -from google.adk.tools import ToolContext - -logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -async def execute_sql_query(sql_query: str, tool_context: ToolContext) -> Dict[str, Any]: - """Executes a read-only SQL SELECT query using stored PostgreSQL connection metadata. - - Args: - sql_query: The SQL SELECT statement to execute. - tool_context: Provides session state containing saved database metadata. - - Returns: - A dictionary with: - - result: JSON string containing query results or an error message. 
- """ - logger.info(f"Running SQL query: {sql_query}") - - # Ensure the query is read-only - if not sql_query.strip().lower().startswith("select"): - logger.warning("Only SELECT queries are allowed.") - return {"result": json.dumps({"error": "Only SELECT queries are allowed."})} - - # Retrieve stored connection metadata - db_conn = tool_context.state.get("db_connection") - if not db_conn or db_conn.get("status") != "connected": - logger.error("No valid database connection found.") - return {"result": json.dumps({"error": "Database not connected or inactive."})} - - metadata = db_conn.get("metadata") - if not metadata: - logger.error("Database metadata missing in session state.") - return {"result": json.dumps({"error": "Missing database metadata."})} - - try: - # Create a temporary connection for query execution - conn = psycopg2.connect( - host=metadata["host"], - port=metadata["port"], - dbname=metadata["dbname"], - user=metadata["user"], - password="postgres", - ) - - # log the connection object - logger.info(f"******* Connection object: {conn}") - - # Execute the query - with conn.cursor() as cursor: - cursor.execute(sql_query) - columns = [desc[0] for desc in cursor.description] - rows = cursor.fetchall() - result = [dict(zip(columns, row)) for row in rows] - - conn.close() - logger.info(f"Query executed successfully — rows returned: {len(result)}") - return {"result": json.dumps(result, default=str)} - - except Exception as e: - logger.error(f"SQL execution failed: {e}") - return {"result": json.dumps({"error": str(e)})} \ No newline at end of file diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py index 6cc5b74..bf1fa27 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py @@ -1,6 +1,6 @@ import logging from typing import Dict, Any, List -import pyodbc +import psycopg2 import json import os import re @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") -# --- Copied LLM Client Setup & Helper Functions --- +# --- LLM Client Setup --- try: _, project_id = google.auth.default() GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) @@ -20,11 +20,11 @@ GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") if not GOOGLE_CLOUD_PROJECT: - logger.warning("GOOGLE_CLOUD_PROJECT not set.") + logger.warning("GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials.") GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") GOOGLE_GENAI_USE_VERTEXAI = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "True").lower() in ("true", "1") -MODEL = os.environ.get("MODEL", "gemini-1.5-pro") +MODEL = os.environ.get("MODEL", "gemini-2.5-pro") client = None if GOOGLE_CLOUD_PROJECT: @@ -34,14 +34,27 @@ project=GOOGLE_CLOUD_PROJECT, location=GOOGLE_CLOUD_LOCATION, ) - logger.info(f"GenAI Client initialized in mssql_utils.") + logger.info(f"GenAI Client initialized in postgres_utils. 
VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}") except Exception as e: - logger.error(f"Failed to initialize GenAI Client in mssql_utils: {e}") + logger.error(f"Failed to initialize GenAI Client in postgres_utils: {e}") else: - logger.error("Cannot initialize GenAI Client in mssql_utils: GOOGLE_CLOUD_PROJECT is not set.") + logger.error("Cannot initialize GenAI Client in postgres_utils: GOOGLE_CLOUD_PROJECT is not set.") -def _construct_llm_prompt(schema_name: str, db_type:str, schema_details: Dict[str, Any]) -> str: - # ... This function is IDENTICAL to the one in mysql_utils.py ... +def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: + """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" + cursor = conn.cursor() + try: + cursor.execute(query) + if cursor.description: + columns = [desc[0] for desc in cursor.description] + rows = cursor.fetchall() + return [dict(zip(columns, row)) for row in rows] + return [] + finally: + cursor.close() + +def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> str: + """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" tables_context = {} for table_name, table_info in schema_details.get("tables", {}).items(): tables_context[table_name] = { @@ -116,11 +129,7 @@ def _extract_json_content(text: str) -> str: if not text: return "" match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) extracted = match.group(1).strip() if match else text.strip() - try: - parsed = json.loads(extracted) - return json.dumps(parsed, indent=4) - except json.JSONDecodeError: - return extracted + return extracted def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]: """Calls an LLM to get inferred relationships and anomalies.""" @@ -132,16 +141,16 @@ def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") generated_text = "" try: - # logger.info(f"****** Custom_LLM_Request: {prompt}") + logger.debug(f"****** Custom_LLM_Request: {prompt}") response = client.models.generate_content( model=MODEL, contents=[types.Part.from_text(text=prompt)], config=types.GenerateContentConfig(response_mime_type="application/json"), ) generated_text = response.candidates[0].content.parts[0].text - # logger.info(f"****** Raw LLM Response: {generated_text}") + logger.debug(f"****** Raw LLM Response: {generated_text}") cleaned_json = _extract_json_content(generated_text) - # logger.info(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + logger.debug(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") llm_output = json.loads(cleaned_json) inferred = llm_output.get("inferred_relationships", []) anomalies = llm_output.get("anomalies", []) @@ -155,75 +164,74 @@ def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, logger.error(f"Error calling LLM or processing response: {e}") return {"inferred_relationships": [], "anomalies": [{"error": f"LLM analysis failed: {e}"}]} except Exception as e: - logger.error(f"Unexpected error during LLM analysis: {e}") + logger.error(f"Unexpected error during LLM analysis: {e}", exc_info=True) return {"inferred_relationships": [], "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}]} -# --- End Copied LLM Functions --- -def 
_execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: - """Executes a SQL query and returns results as a list of dicts for SQL Server.""" - cursor = conn.cursor() - try: - cursor.execute(query) - if cursor.description: - columns = [column[0] for column in cursor.description] - rows = cursor.fetchall() - return [dict(zip(columns, row)) for row in rows] - return [] - finally: - cursor.close() - -def get_mssql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: - logger.info(f"Fetching MSSQL schema details for: {schema_name}") +def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: details = {"tables": {}, "views": {}, "foreign_keys": [], "inferred_relationships": [], "anomalies": []} + logger.info(f"Fetching PostgreSQL schema details for: {schema_name}") - tables_query = f"SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_TYPE = 'BASE TABLE';" + tables_query = f""" + SELECT table_name + FROM information_schema.tables + WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE'; + """ tables = _execute_query(conn, tables_query) for table in tables: - t_name = table['TABLE_NAME'] + t_name = table['table_name'] details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} - cols_query = f"SELECT COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_NAME = '{t_name}';" + cols_query = f""" + SELECT column_name, data_type, character_maximum_length, numeric_precision, numeric_scale, is_nullable, column_default + FROM information_schema.columns WHERE table_schema = '{schema_name}' AND table_name = '{t_name}'; + """ for col in _execute_query(conn, cols_query): - details["tables"][t_name]["columns"][col['COLUMN_NAME']] = {"type": col['DATA_TYPE'], "length": col['CHARACTER_MAXIMUM_LENGTH'], "precision": col['NUMERIC_PRECISION'], "scale": col['NUMERIC_SCALE'], "nullable": col['IS_NULLABLE'] == 'YES', "default": col['COLUMN_DEFAULT']} + details["tables"][t_name]["columns"][col['column_name']] = { + "type": col['data_type'], "length": col['character_maximum_length'], "precision": col['numeric_precision'], + "scale": col['numeric_scale'], "nullable": col['is_nullable'] == 'YES', "default": col['column_default'], + } constraints_query = f""" - SELECT KCU.TABLE_NAME, TC.CONSTRAINT_NAME, TC.CONSTRAINT_TYPE, KCU.COLUMN_NAME, CC.CHECK_CLAUSE - FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC - LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA AND TC.TABLE_NAME = KCU.TABLE_NAME - LEFT JOIN INFORMATION_SCHEMA.CHECK_CONSTRAINTS AS CC ON TC.CONSTRAINT_NAME = CC.CONSTRAINT_NAME AND TC.CONSTRAINT_SCHEMA = CC.CONSTRAINT_SCHEMA - WHERE TC.TABLE_SCHEMA = '{schema_name}' AND KCU.TABLE_NAME = '{t_name}'; + SELECT tc.table_name, tc.constraint_name, tc.constraint_type, kcu.column_name, cc.check_clause + FROM information_schema.table_constraints tc + LEFT JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema AND tc.table_name = kcu.table_name + LEFT JOIN information_schema.check_constraints cc ON tc.constraint_name = cc.constraint_name AND tc.table_schema = cc.constraint_schema + WHERE tc.table_schema = '{schema_name}' AND tc.table_name = '{t_name}'; """ details["tables"][t_name]["constraints"] = _execute_query(conn, 
constraints_query) indexes_query = f""" - SELECT t.name AS table_name, ind.name AS index_name, COL_NAME(ic.object_id, ic.column_id) AS column_name, ind.is_unique - FROM sys.indexes ind INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id AND ind.index_id = ic.index_id - INNER JOIN sys.tables t ON ind.object_id = t.object_id INNER JOIN sys.schemas s ON t.schema_id = s.schema_id - WHERE s.name = '{schema_name}' AND t.name = '{t_name}' AND ind.is_hypothetical = 0 AND ind.is_primary_key = 0 AND ind.type > 0; + SELECT + t.relname AS table_name, i.relname AS index_name, a.attname AS column_name, ix.indisunique AS is_unique + FROM pg_class t JOIN pg_index ix ON t.oid = ix.indrelid JOIN pg_class i ON i.oid = ix.indexrelid + LEFT JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = ANY(ix.indkey) + JOIN pg_namespace n ON t.relnamespace = n.oid WHERE n.nspname = '{schema_name}' AND t.relname = '{t_name}' AND t.relkind = 'r'; """ try: indexes = _execute_query(conn, indexes_query) grouped_indexes = {} for index in indexes: - idx_name = index['index_name'] - if not idx_name: continue - if idx_name not in grouped_indexes: grouped_indexes[idx_name] = {"name": idx_name, "columns": [], "unique": index['is_unique']} - if index['column_name'] not in grouped_indexes[idx_name]["columns"]: grouped_indexes[idx_name]["columns"].append(index['column_name']) + if index['column_name']: + idx_name = index['index_name'] + if idx_name not in grouped_indexes: grouped_indexes[idx_name] = {"name": idx_name, "columns": [], "unique": index['is_unique']} + if index['column_name'] not in grouped_indexes[idx_name]["columns"]: grouped_indexes[idx_name]["columns"].append(index['column_name']) details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) - except Exception as e: logger.error(f"Error fetching MSSQL indexes for {t_name}: {e}") + except Exception as e: logger.error(f"Error fetching PostgreSQL indexes for {t_name}: {e}") fks_query = f""" - SELECT KCU1.CONSTRAINT_NAME AS fk_constraint_name, KCU1.TABLE_SCHEMA AS from_schema, KCU1.TABLE_NAME AS from_table, KCU1.COLUMN_NAME AS from_column, - KCU2.TABLE_SCHEMA AS to_schema, KCU2.TABLE_NAME AS to_table, KCU2.COLUMN_NAME AS to_column - FROM INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS RC - JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU1 ON KCU1.CONSTRAINT_SCHEMA = RC.CONSTRAINT_SCHEMA AND KCU1.CONSTRAINT_NAME = RC.CONSTRAINT_NAME - JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU2 ON KCU2.CONSTRAINT_SCHEMA = RC.UNIQUE_CONSTRAINT_SCHEMA AND KCU2.CONSTRAINT_NAME = RC.UNIQUE_CONSTRAINT_NAME AND KCU2.ORDINAL_POSITION = KCU1.ORDINAL_POSITION - WHERE KCU1.TABLE_SCHEMA = '{schema_name}'; + SELECT + tc.constraint_name, tc.table_name AS from_table, kcu.column_name AS from_column, + ccu.table_schema AS to_schema, ccu.table_name AS to_table, ccu.column_name AS to_column + FROM information_schema.table_constraints AS tc JOIN information_schema.key_column_usage AS kcu + ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema + JOIN information_schema.constraint_column_usage AS ccu + ON ccu.constraint_name = tc.constraint_name AND ccu.table_schema = tc.table_schema + WHERE tc.constraint_type = 'FOREIGN KEY' AND tc.table_schema = '{schema_name}'; """ details["foreign_keys"] = _execute_query(conn, fks_query) - views_query = f"SELECT TABLE_NAME AS view_name, VIEW_DEFINITION FROM INFORMATION_SCHEMA.VIEWS WHERE TABLE_SCHEMA = '{schema_name}';" - for view in _execute_query(conn, views_query): details["views"][view['view_name']] = {"definition": 
view['VIEW_DEFINITION']} + views_query = f"SELECT table_name AS view_name, view_definition FROM information_schema.views WHERE table_schema = '{schema_name}';" + details["views"] = {view['view_name']: {"definition": view['view_definition']} for view in _execute_query(conn, views_query)} - llm_analysis = _analyze_with_llm(schema_name, "Microsoft SQL Server", details) + llm_analysis = _analyze_with_llm(schema_name, "PostgreSQL", details) details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) details["anomalies"] = llm_analysis.get("anomalies", []) - logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships for MSSQL.") - logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies for MSSQL.") + logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships for PostgreSQL.") + logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies for PostgreSQL.") return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py index af41bfb..1e3a2fc 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py @@ -138,7 +138,6 @@ def _extract_json_content(text: str) -> str: else: extracted = text.strip() - # Try to pretty format if valid JSON try: parsed = json.loads(extracted) return json.dumps(parsed, indent=4) @@ -158,17 +157,17 @@ def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") generated_text = "" try: - logger.info(f"****** Custom_LLM_Request: {prompt}") + logger.debug(f"****** Custom_LLM_Request: {prompt}") response = client.models.generate_content( model=MODEL, contents=[types.Part.from_text(text=prompt)], ) generated_text = response.candidates[0].content.parts[0].text - logger.info(f"****** Raw LLM Response: {generated_text}") + logger.debug(f"****** Raw LLM Response: {generated_text}") - # 🔹 Extract JSON content (handles ```json blocks) + # handles ```json blocks cleaned_json = _extract_json_content(generated_text) - logger.info(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + logger.debug(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") # Parse the cleaned JSON llm_output = json.loads(cleaned_json) @@ -275,8 +274,8 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships.") logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies.") - logger.info("************************") + logger.debug("************************") logger.info(details) - logger.info("************************") + logger.debug("************************") return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py index 22ecc76..c938507 100644 --- 
a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py @@ -24,7 +24,7 @@ GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") GOOGLE_GENAI_USE_VERTEXAI = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "True").lower() in ("true", "1") -MODEL = os.environ.get("MODEL", "gemini-1.5-pro") +MODEL = os.environ.get("MODEL", "gemini-2.5-pro") client = None if GOOGLE_CLOUD_PROJECT: @@ -145,16 +145,16 @@ def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") generated_text = "" try: - # logger.info(f"****** Custom_LLM_Request: {prompt}") + logger.debug(f"****** Custom_LLM_Request: {prompt}") response = client.models.generate_content( model=MODEL, contents=[types.Part.from_text(text=prompt)], config=types.GenerateContentConfig(response_mime_type="application/json"), ) generated_text = response.candidates[0].content.parts[0].text - # logger.info(f"****** Raw LLM Response: {generated_text}") + logger.debug(f"****** Raw LLM Response: {generated_text}") cleaned_json = _extract_json_content(generated_text) - # logger.info(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + logger.debug(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") llm_output = json.loads(cleaned_json) inferred = llm_output.get("inferred_relationships", []) anomalies = llm_output.get("anomalies", []) From 7a7dbb6cdacd6f754e010b756277a95fb09dc28a Mon Sep 17 00:00:00 2001 From: SanuGhosh Date: Wed, 19 Nov 2025 13:53:28 +0530 Subject: [PATCH 5/8] feat(data-model-discovery-agent): refactor code fix conflicts --- agent-app/app/prompt.py | 2 +- .../data_profiling_agent/utils/mssql_profiling_utils.py | 4 ---- .../data_profiling_agent/utils/postgres_profiling_utils.py | 4 ---- .../sub_agents/reporting_agent/tools.py | 1 - .../schema_introspection_agent/utils/mssql_utils.py | 1 - .../schema_introspection_agent/utils/postgresql_utils.py | 1 - 6 files changed, 1 insertion(+), 12 deletions(-) diff --git a/agent-app/app/prompt.py b/agent-app/app/prompt.py index 79d443a..4f19adb 100644 --- a/agent-app/app/prompt.py +++ b/agent-app/app/prompt.py @@ -27,4 +27,4 @@ 12. Use the 'application_portfolio_analyzer' agent to help the user with any application or server details related queries and to create an application portfolio report. 13. If the user asks about database discovery or database profiling please delegate the task to the following agent `data_model_discovery_agent`. 14. 
else use 'google_search_dummy_agent' - """ \ No newline at end of file + """ diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py index 80a3483..1c63136 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py @@ -28,7 +28,6 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, profile_results["cardinality"][table_name] = {} full_table_name = f"[{schema_name}].[{table_name}]" - # Nullability (AC 4.1) for col_name in table_info.get("columns", {}): null_q = f""" SELECT @@ -46,7 +45,6 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, logger.error(f"Error profiling nulls for {full_table_name}.[{col_name}]: {e}") profile_results["nullability"][table_name][col_name] = "Error" - # Cardinality (AC 4.2) - PKs, FKs key_columns = set() for const in table_info.get("constraints", []): if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): @@ -65,7 +63,6 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, logger.error(f"Error profiling cardinality for {full_table_name}.[{col_name}]: {e}") profile_results["cardinality"][table_name][col_name] = "Error" - # Orphan Records (AC 4.3) for fk in schema_structure.get("foreign_keys", []): from_table, from_col = fk.get("from_table"), fk.get("from_column") to_table, to_col = fk.get("to_table"), fk.get("to_column") @@ -92,7 +89,6 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, logger.error(f"Error checking orphans for {fk_name}: {e}") profile_results["orphan_records"][fk_name] = "Error" - # Type Anomalies (AC 4.4) - Heuristic for phone/zip for table_name, table_info in tables.items(): full_table_name = f"[{schema_name}].[{table_name}]" for col_name, col_info in table_info.get("columns", {}).items(): diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py index 03904ec..2cb915e 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py @@ -29,7 +29,6 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st profile_results["cardinality"][table_name] = {} full_table_name = f'"{schema_name}"."{table_name}"' - # Nullability (AC 4.1) for col_name in table_info.get("columns", {}): null_q = f""" SELECT @@ -47,7 +46,6 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st logger.error(f"Error profiling nulls for {full_table_name}.\"{col_name}\": {e}") profile_results["nullability"][table_name][col_name] = "Error" - # Cardinality (AC 4.2) - PKs, FKs key_columns = set() for const in table_info.get("constraints", []): if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): @@ -66,7 +64,6 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st logger.error(f"Error 
profiling cardinality for {full_table_name}.\"{col_name}\": {e}") profile_results["cardinality"][table_name][col_name] = "Error" - # Orphan Records (AC 4.3) for fk in schema_structure.get("foreign_keys", []): from_table, from_col = fk.get("from_table"), fk.get("from_column") to_table, to_col = fk.get("to_table"), fk.get("to_column") @@ -93,7 +90,6 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st logger.error(f"Error checking orphans for {fk_name}: {e}") profile_results["orphan_records"][fk_name] = "Error" - # Type Anomalies (AC 4.4) - Heuristic for phone/zip for table_name, table_info in tables.items(): full_table_name = f'"{schema_name}"."{table_name}"' for col_name, col_info in table_info.get("columns", {}).items(): diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py index 7b944a2..259fbda 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py @@ -115,7 +115,6 @@ def safe_encoder(obj): Handles Decimal, datetime, UUID, set, custom objects, etc. """ try: - # First try normal encoding return json.JSONEncoder().default(obj) except Exception: # Fallback: convert everything else to string diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py index bf1fa27..d94992b 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py @@ -12,7 +12,6 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") -# --- LLM Client Setup --- try: _, project_id = google.auth.default() GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py index c938507..9f2a678 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py @@ -12,7 +12,6 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") -# --- LLM Client Setup --- try: _, project_id = google.auth.default() GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) From 957b8dc4b8f4f72c9e3380020796bb2657c8bf89 Mon Sep 17 00:00:00 2001 From: SanuGhosh Date: Wed, 19 Nov 2025 14:20:52 +0530 Subject: [PATCH 6/8] feat(data-model-discovery-agent): fix linting issue --- agent-app/app/agent.py | 2 +- .../data_model_discovery_agent/__init__.py | 2 +- .../data_model_discovery_agent/agent.py | 33 +++-- .../sub_agents/data_profiling_agent/agent.py | 6 +- .../sub_agents/data_profiling_agent/tools.py | 69 +++++++--- .../utils/mssql_profiling_utils.py | 68 +++++++--- .../utils/mysql_profiling_utils.py | 64 ++++++--- 
.../utils/postgres_profiling_utils.py | 64 ++++++--- .../sub_agents/database_cred_agent/agent.py | 10 +- .../sub_agents/database_cred_agent/tools.py | 46 +++++-- .../sub_agents/qa_agent/agent.py | 25 ++-- .../sub_agents/reporting_agent/agent.py | 6 +- .../sub_agents/reporting_agent/tools.py | 88 +++++++++---- .../schema_introspection_agent/agent.py | 10 +- .../schema_introspection_agent/tools.py | 43 ++++-- .../utils/mssql_utils.py | 124 +++++++++++++----- .../utils/mysql_utils.py | 93 +++++++++---- .../utils/postgresql_utils.py | 122 ++++++++++++----- 18 files changed, 634 insertions(+), 241 deletions(-) diff --git a/agent-app/app/agent.py b/agent-app/app/agent.py index 63c581f..05774db 100644 --- a/agent-app/app/agent.py +++ b/agent-app/app/agent.py @@ -68,6 +68,6 @@ capability_mapper_agent, strategy_recommender_agent, detailed_architecture_design_agent, - data_model_discovery_agent + data_model_discovery_agent, ], ) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py b/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py index 7dbd57d..79406b4 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/__init__.py @@ -1 +1 @@ -from .agent import data_model_discovery_agent \ No newline at end of file +from .agent import data_model_discovery_agent diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py index 46681e7..b4da3c9 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) + def root_agent_instruction(ctx: ReadonlyContext) -> str: """Dynamically builds the Root Agent's instruction based on session state.""" selected_schema = ctx.state.get("selected_schema") @@ -110,7 +111,9 @@ def root_agent_instruction(ctx: ReadonlyContext) -> str: """ if not db_connection or db_connection.get("status") != "connected": - return base_instruction + """ + return ( + base_instruction + + """ **Current State:** No active database connection. **Your Task:** @@ -140,8 +143,11 @@ def root_agent_instruction(ctx: ReadonlyContext) -> str: - Answer questions about your data and schema structure To do any of this, I'll first need to connect to your database. Just let me know when you want to proceed!" """ + ) elif available_schemas and not selected_schema: - return base_instruction + """ + return ( + base_instruction + + """ **Current Task:** The user has been presented with a list of available schemas by the `database_cred_agent`. Their current input is expected to be the name of the schema they wish to analyze. 1. Consider the user's entire input as the desired schema name. @@ -149,9 +155,12 @@ def root_agent_instruction(ctx: ReadonlyContext) -> str: - Example AgentTool Call: `schema_introspection_agent(user_input)` 3. The `schema_introspection_agent` will handle storing the selected schema and fetching the details. Await its response. """ + ) elif selected_schema and schema_structure: profile_status = "Completed" if data_profile else "Not Yet Run" - return base_instruction + f""" + return ( + base_instruction + + f""" **Current Context:** The database is connected. The schema '{selected_schema}' has been successfully introspected. 
Data Quality Profile Status: {profile_status} @@ -171,20 +180,28 @@ def root_agent_instruction(ctx: ReadonlyContext) -> str: If the user's intent is unclear, ask for clarification. You can remind them of the available actions. """ + ) elif selected_schema and not schema_structure: - return base_instruction + f""" + return ( + base_instruction + + f""" **Current Context:** The schema '{selected_schema}' was selected, but the introspection data is missing or incomplete. - Recall `schema_introspection_agent` and pass the schema name '{selected_schema}' as the input to it to ensure the structure is loaded. - Example AgentTool Call: `schema_introspection_agent("{selected_schema}")` """ + ) else: - return base_instruction + """ + return ( + base_instruction + + """ **Current Task:** Determine the next step based on the conversation history and session state. If unsure, ask the user for clarification. """ + ) + data_model_discovery_agent = LlmAgent( - model='gemini-2.5-flash', - name='data_model_discovery_agent', + model="gemini-2.5-flash", + name="data_model_discovery_agent", description=( "A helpful root agent that orchestrates sub-agents to introspect and profile legacy databases." ), @@ -195,5 +212,5 @@ def root_agent_instruction(ctx: ReadonlyContext) -> str: qa_agent, data_profiling_agent, reporting_agent, - ] + ], ) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py index 530743c..7b3c499 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py @@ -3,9 +3,9 @@ from ..qa_agent.agent import qa_agent data_profiling_agent = LlmAgent( - model='gemini-2.5-flash', - name='data_profiling_agent', - description='Profiles data quality for the selected schema and then calls QA agent to summarize.', + model="gemini-2.5-flash", + name="data_profiling_agent", + description="Profiles data quality for the selected schema and then calls QA agent to summarize.", instruction=""" ### Role You are a **Data Profiling Agent**. Your sole responsibility is to run data profiling on a schema and then immediately hand off the summary of findings to the QA agent for user-facing reporting. 
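[Editorial note] The hunks above keep the project's core pattern intact: a callable instruction provider that branches on session state, handed to an `LlmAgent` alongside plain async functions used as tools. For readers unfamiliar with that pattern, here is a minimal, self-contained sketch of it. The agent name, the `connection_status_instruction` provider, and the `echo_selected_schema` tool are hypothetical illustrations; only `LlmAgent`, `ToolContext`, and the callable `instruction` usage mirror what the diffs show, and the `ReadonlyContext` import path is an assumption.

```python
from typing import Any, Dict

from google.adk.agents import LlmAgent
from google.adk.agents.readonly_context import ReadonlyContext  # assumed import path
from google.adk.tools import ToolContext


def connection_status_instruction(ctx: ReadonlyContext) -> str:
    """Hypothetical instruction provider mirroring root_agent_instruction's state checks."""
    base = "You help users discover and profile a legacy database.\n"
    db_connection = ctx.state.get("db_connection") or {}
    if db_connection.get("status") != "connected":
        return base + "No active connection: ask the user for database credentials first."
    if not ctx.state.get("selected_schema"):
        return base + "Connected: ask the user which schema to introspect."
    return base + "Schema selected: offer profiling, Q&A, or report generation."


async def echo_selected_schema(tool_context: ToolContext) -> Dict[str, Any]:
    """Hypothetical tool showing how tools read the same session state the provider uses."""
    return {"selected_schema": tool_context.state.get("selected_schema")}


demo_agent = LlmAgent(
    model="gemini-2.5-flash",
    name="state_aware_demo_agent",
    description="Illustrative agent wired with a dynamic instruction provider.",
    instruction=connection_status_instruction,
    tools=[echo_selected_schema],
)
```

Under those assumptions the agent's behaviour changes only through session state written by its tools, which is why the formatting-only hunks in this patch leave the long conditional return blocks of `root_agent_instruction` untouched.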
diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py index d91675c..d07dcab 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py @@ -4,29 +4,43 @@ import psycopg2 import mysql.connector import pyodbc -from .utils import postgres_profiling_utils, mysql_profiling_utils, mssql_profiling_utils +from .utils import ( + postgres_profiling_utils, + mysql_profiling_utils, + mssql_profiling_utils, +) logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) + def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: db_type = metadata.get("db_type") host = metadata.get("host") port = int(metadata.get("port")) dbname = metadata.get("dbname") user = metadata.get("user") - logger.info(f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}") + logger.info( + f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}" + ) if db_type == "postgresql": - return psycopg2.connect(host=host, port=port, dbname=dbname, user=user, password=password) + return psycopg2.connect( + host=host, port=port, dbname=dbname, user=user, password=password + ) elif db_type == "mysql": - return mysql.connector.connect(host=host, port=port, database=dbname, user=user, password=password) + return mysql.connector.connect( + host=host, port=port, database=dbname, user=user, password=password + ) elif db_type == "mssql": conn_str = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={host},{port};DATABASE={dbname};UID={user};PWD={password}" return pyodbc.connect(conn_str) else: raise ValueError(f"Unsupported database type: {db_type}") -async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: + +async def profile_schema_data( + tool_context: ToolContext, args: Dict[str, Any] +) -> Dict[str, Any]: """ Profiles the data in the selected schema based on the schema structure. Calculates nullability, cardinality, orphan records, and type anomalies. @@ -41,10 +55,14 @@ async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any]) - schema_structure = tool_context.state.get("schema_structure") sample_size = args.get("sample_size", 10000) - if not db_conn_state or db_conn_state.get("status") != "connected": return {"error": "DB not connected."} - if not db_creds: return {"error": "DB credentials not found."} - if not schema_name: return {"error": "Selected schema not found."} - if not schema_structure: return {"error": "Schema structure not found. Please run introspection first."} + if not db_conn_state or db_conn_state.get("status") != "connected": + return {"error": "DB not connected."} + if not db_creds: + return {"error": "DB credentials not found."} + if not schema_name: + return {"error": "Selected schema not found."} + if not schema_structure: + return {"error": "Schema structure not found. 
Please run introspection first."} metadata = db_conn_state["metadata"] password = db_creds["password"] @@ -53,20 +71,30 @@ async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any]) - conn = None try: conn = _get_db_connection(metadata, password) - logger.info(f"Reconnected to {db_type} for data profiling of schema '{schema_name}'.") + logger.info( + f"Reconnected to {db_type} for data profiling of schema '{schema_name}'." + ) if db_type == "postgresql": - profile_results = postgres_profiling_utils.profile_postgres_data(conn, schema_name, schema_structure, sample_size) + profile_results = postgres_profiling_utils.profile_postgres_data( + conn, schema_name, schema_structure, sample_size + ) elif db_type == "mysql": - profile_results = mysql_profiling_utils.profile_mysql_data(conn, schema_name, schema_structure, sample_size) + profile_results = mysql_profiling_utils.profile_mysql_data( + conn, schema_name, schema_structure, sample_size + ) elif db_type == "mssql": - profile_results = mssql_profiling_utils.profile_mssql_data(conn, schema_name, schema_structure, sample_size) + profile_results = mssql_profiling_utils.profile_mssql_data( + conn, schema_name, schema_structure, sample_size + ) else: return {"error": f"Profiling for {db_type} not implemented."} tool_context.state["data_profile"] = profile_results - tool_context.state["profiling_just_completed"] = True # Set the flag - logger.info(f"Data profiling results for '{schema_name}' saved to session state.") + tool_context.state["profiling_just_completed"] = True # Set the flag + logger.info( + f"Data profiling results for '{schema_name}' saved to session state." + ) return { "status": "success", @@ -75,9 +103,12 @@ async def profile_schema_data(tool_context: ToolContext, args: Dict[str, Any]) - } except Exception as e: logger.error(f"Error during data profiling: {e}", exc_info=True) - return {"error": f"Failed to profile data for {db_type} ({schema_name}): {str(e)}"} + return { + "error": f"Failed to profile data for {db_type} ({schema_name}): {str(e)}" + } finally: if conn: - try: conn.close() - except Exception as e: logger.error(f"Error closing {db_type} connection: {e}") - \ No newline at end of file + try: + conn.close() + except Exception as e: + logger.error(f"Error closing {db_type} connection: {e}") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py index 1c63136..6e51138 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py @@ -5,6 +5,7 @@ logger = logging.getLogger(__name__) + def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts for SQL Server.""" cursor = conn.cursor() @@ -18,8 +19,19 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: finally: cursor.close() -def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, Any], sample_size: int = 10000) -> Dict[str, Any]: - profile_results = {"nullability": {}, "cardinality": {}, "orphan_records": {}, "type_anomalies": {}} + +def profile_mssql_data( + conn: Any, + schema_name: str, + schema_structure: Dict[str, Any], + sample_size: int = 10000, +) -> Dict[str, Any]: 
+ profile_results = { + "nullability": {}, + "cardinality": {}, + "orphan_records": {}, + "type_anomalies": {}, + } tables = schema_structure.get("tables", {}) for table_name, table_info in tables.items(): @@ -37,30 +49,38 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, null_q)[0] - total_count = int(res['total_count']) - null_count = int(res['null_count']) + total_count = int(res["total_count"]) + null_count = int(res["null_count"]) null_pct = (null_count / total_count) * 100 if total_count > 0 else 0 - profile_results["nullability"][table_name][col_name] = round(null_pct, 2) + profile_results["nullability"][table_name][col_name] = round( + null_pct, 2 + ) except Exception as e: - logger.error(f"Error profiling nulls for {full_table_name}.[{col_name}]: {e}") + logger.error( + f"Error profiling nulls for {full_table_name}.[{col_name}]: {e}" + ) profile_results["nullability"][table_name][col_name] = "Error" key_columns = set() for const in table_info.get("constraints", []): if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): - key_columns.add(const["columns"]) + key_columns.add(const["columns"]) for fk in schema_structure.get("foreign_keys", []): if fk.get("from_table") == table_name and fk.get("from_column"): key_columns.add(fk["from_column"]) for col_name in key_columns: - if col_name in table_info.get("columns", {}): + if col_name in table_info.get("columns", {}): card_q = f"SELECT COUNT(DISTINCT [{col_name}]) as unique_count FROM {full_table_name};" try: res = _execute_query(conn, card_q)[0] - profile_results["cardinality"][table_name][col_name] = int(res['unique_count']) + profile_results["cardinality"][table_name][col_name] = int( + res["unique_count"] + ) except Exception as e: - logger.error(f"Error profiling cardinality for {full_table_name}.[{col_name}]: {e}") + logger.error( + f"Error profiling cardinality for {full_table_name}.[{col_name}]: {e}" + ) profile_results["cardinality"][table_name][col_name] = "Error" for fk in schema_structure.get("foreign_keys", []): @@ -81,9 +101,11 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, orphan_q)[0] - total_fk_values = int(res['total_fk_values']) - orphan_count = int(res['orphan_count']) - orphan_pct = (orphan_count / total_fk_values) * 100 if total_fk_values > 0 else 0 + total_fk_values = int(res["total_fk_values"]) + orphan_count = int(res["orphan_count"]) + orphan_pct = ( + (orphan_count / total_fk_values) * 100 if total_fk_values > 0 else 0 + ) profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) except Exception as e: logger.error(f"Error checking orphans for {fk_name}: {e}") @@ -92,9 +114,13 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, for table_name, table_info in tables.items(): full_table_name = f"[{schema_name}].[{table_name}]" for col_name, col_info in table_info.get("columns", {}).items(): - col_type = col_info.get("type", "").lower() - if "char" in col_type or "text" in col_type or "varchar" in col_type: - if "phone" in col_name.lower() or "zip" in col_name.lower() or "postal" in col_name.lower(): + col_type = col_info.get("type", "").lower() + if "char" in col_type or "text" in col_type or "varchar" in col_type: + if ( + "phone" in col_name.lower() + or "zip" in col_name.lower() + or "postal" in col_name.lower() + ): # Regex for anything not a digit, hyphen, or period anomaly_q = f""" SELECT COUNT_BIG(*) as 
non_numeric_count @@ -103,12 +129,16 @@ def profile_mssql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, anomaly_q)[0] - non_numeric_count = int(res['non_numeric_count']) + non_numeric_count = int(res["non_numeric_count"]) if non_numeric_count > 0: key = f"{table_name}.{col_name}" if key not in profile_results["type_anomalies"]: profile_results["type_anomalies"][key] = [] - profile_results["type_anomalies"][key].append(f"Found {non_numeric_count} rows with non-numeric characters in sample.") + profile_results["type_anomalies"][key].append( + f"Found {non_numeric_count} rows with non-numeric characters in sample." + ) except Exception as e: - logger.warning(f"Error checking type anomaly for {full_table_name}.[{col_name}]: {e}") + logger.warning( + f"Error checking type anomaly for {full_table_name}.[{col_name}]: {e}" + ) return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py index c251a95..994e6aa 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py @@ -3,6 +3,7 @@ logger = logging.getLogger(__name__) + def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: cursor = conn.cursor(dictionary=True) try: @@ -11,14 +12,25 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: finally: cursor.close() -def profile_mysql_data(conn: Any, schema_name: str, schema_structure: Dict[str, Any], sample_size: int = 10000) -> Dict[str, Any]: + +def profile_mysql_data( + conn: Any, + schema_name: str, + schema_structure: Dict[str, Any], + sample_size: int = 10000, +) -> Dict[str, Any]: try: conn.database = schema_name except Exception as e: logger.error(f"Failed to set database {schema_name}: {e}") raise - profile_results = {"nullability": {}, "cardinality": {}, "orphan_records": {}, "type_anomalies": {}} + profile_results = { + "nullability": {}, + "cardinality": {}, + "orphan_records": {}, + "type_anomalies": {}, + } tables = schema_structure.get("tables", {}) for table_name, table_info in tables.items(): @@ -35,8 +47,14 @@ def profile_mysql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, null_q)[0] - null_pct = (res['null_count'] / res['total_count']) * 100 if res['total_count'] > 0 else 0 - profile_results["nullability"][table_name][col_name] = round(null_pct, 2) + null_pct = ( + (res["null_count"] / res["total_count"]) * 100 + if res["total_count"] > 0 + else 0 + ) + profile_results["nullability"][table_name][col_name] = round( + null_pct, 2 + ) except Exception as e: logger.error(f"Error profiling nulls for {table_name}.{col_name}: {e}") profile_results["nullability"][table_name][col_name] = "Error" @@ -45,19 +63,23 @@ def profile_mysql_data(conn: Any, schema_name: str, schema_structure: Dict[str, key_columns = set() for const in table_info.get("constraints", []): if const.get("type") in ("PRIMARY KEY", "UNIQUE") and const.get("columns"): - key_columns.add(const["columns"]) + key_columns.add(const["columns"]) for fk in schema_structure.get("foreign_keys", []): if fk.get("from_table") == table_name and fk.get("from_column"): key_columns.add(fk["from_column"]) for col_name in 
key_columns: - if col_name in table_info.get("columns", {}): + if col_name in table_info.get("columns", {}): card_q = f"SELECT COUNT(DISTINCT `{col_name}`) as unique_count FROM `{table_name}`;" try: res = _execute_query(conn, card_q)[0] - profile_results["cardinality"][table_name][col_name] = res['unique_count'] + profile_results["cardinality"][table_name][col_name] = res[ + "unique_count" + ] except Exception as e: - logger.error(f"Error profiling cardinality for {table_name}.{col_name}: {e}") + logger.error( + f"Error profiling cardinality for {table_name}.{col_name}: {e}" + ) profile_results["cardinality"][table_name][col_name] = "Error" # Orphan Records @@ -76,7 +98,11 @@ def profile_mysql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, orphan_q)[0] - orphan_pct = (res['orphan_count'] / res['total_fk_values']) * 100 if res['total_fk_values'] > 0 else 0 + orphan_pct = ( + (res["orphan_count"] / res["total_fk_values"]) * 100 + if res["total_fk_values"] > 0 + else 0 + ) profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) except Exception as e: logger.error(f"Error checking orphans for {fk_name}: {e}") @@ -85,9 +111,13 @@ def profile_mysql_data(conn: Any, schema_name: str, schema_structure: Dict[str, # Type Anomalies - Heuristic for phone/zip for table_name, table_info in tables.items(): for col_name, col_info in table_info.get("columns", {}).items(): - col_type = col_info.get("type", "").lower() - if "char" in col_type or "text" in col_type: - if "phone" in col_name.lower() or "zip" in col_name.lower() or "postal" in col_name.lower(): + col_type = col_info.get("type", "").lower() + if "char" in col_type or "text" in col_type: + if ( + "phone" in col_name.lower() + or "zip" in col_name.lower() + or "postal" in col_name.lower() + ): anomaly_q = f""" SELECT COUNT(*) as non_numeric_count FROM (SELECT `{col_name}` FROM `{table_name}` WHERE `{col_name}` IS NOT NULL LIMIT {sample_size}) as s @@ -95,11 +125,15 @@ def profile_mysql_data(conn: Any, schema_name: str, schema_structure: Dict[str, """ try: res = _execute_query(conn, anomaly_q)[0] - if res['non_numeric_count'] > 0: + if res["non_numeric_count"] > 0: key = f"{table_name}.{col_name}" if key not in profile_results["type_anomalies"]: profile_results["type_anomalies"][key] = [] - profile_results["type_anomalies"][key].append(f"Found {res['non_numeric_count']} rows with non-numeric characters in sample.") + profile_results["type_anomalies"][key].append( + f"Found {res['non_numeric_count']} rows with non-numeric characters in sample." 
+ ) except Exception as e: - logger.warning(f"Error checking type anomaly for {table_name}.{col_name}: {e}") + logger.warning( + f"Error checking type anomaly for {table_name}.{col_name}: {e}" + ) return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py index 2cb915e..992b814 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py @@ -5,6 +5,7 @@ logger = logging.getLogger(__name__) + def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" cursor = conn.cursor() @@ -19,8 +20,19 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: finally: cursor.close() -def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[str, Any], sample_size: int = 10000) -> Dict[str, Any]: - profile_results = {"nullability": {}, "cardinality": {}, "orphan_records": {}, "type_anomalies": {}} + +def profile_postgres_data( + conn: Any, + schema_name: str, + schema_structure: Dict[str, Any], + sample_size: int = 10000, +) -> Dict[str, Any]: + profile_results = { + "nullability": {}, + "cardinality": {}, + "orphan_records": {}, + "type_anomalies": {}, + } tables = schema_structure.get("tables", {}) for table_name, table_info in tables.items(): @@ -38,12 +50,16 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st """ try: res = _execute_query(conn, null_q)[0] - total_count = int(res['total_count']) - null_count = int(res['null_count']) + total_count = int(res["total_count"]) + null_count = int(res["null_count"]) null_pct = (null_count / total_count) * 100 if total_count > 0 else 0 - profile_results["nullability"][table_name][col_name] = round(null_pct, 2) + profile_results["nullability"][table_name][col_name] = round( + null_pct, 2 + ) except Exception as e: - logger.error(f"Error profiling nulls for {full_table_name}.\"{col_name}\": {e}") + logger.error( + f'Error profiling nulls for {full_table_name}."{col_name}": {e}' + ) profile_results["nullability"][table_name][col_name] = "Error" key_columns = set() @@ -59,15 +75,21 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st card_q = f'SELECT COUNT(DISTINCT "{col_name}") as unique_count FROM {full_table_name};' try: res = _execute_query(conn, card_q)[0] - profile_results["cardinality"][table_name][col_name] = int(res['unique_count']) + profile_results["cardinality"][table_name][col_name] = int( + res["unique_count"] + ) except Exception as e: - logger.error(f"Error profiling cardinality for {full_table_name}.\"{col_name}\": {e}") + logger.error( + f'Error profiling cardinality for {full_table_name}."{col_name}": {e}' + ) profile_results["cardinality"][table_name][col_name] = "Error" for fk in schema_structure.get("foreign_keys", []): from_table, from_col = fk.get("from_table"), fk.get("from_column") to_table, to_col = fk.get("to_table"), fk.get("to_column") - to_schema = fk.get("to_schema", schema_name) # Assume same schema if not specified + to_schema = fk.get( + "to_schema", schema_name + ) # Assume same schema if not specified if from_table and from_col and to_table and 
to_col: fk_name = f"{from_table}.{from_col} -> {to_table}.{to_col}" logger.info(f"Checking orphans for {fk_name}") @@ -82,9 +104,11 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st """ try: res = _execute_query(conn, orphan_q)[0] - total_fk_values = int(res['total_fk_values']) - orphan_count = int(res['orphan_count']) - orphan_pct = (orphan_count / total_fk_values) * 100 if total_fk_values > 0 else 0 + total_fk_values = int(res["total_fk_values"]) + orphan_count = int(res["orphan_count"]) + orphan_pct = ( + (orphan_count / total_fk_values) * 100 if total_fk_values > 0 else 0 + ) profile_results["orphan_records"][fk_name] = round(orphan_pct, 2) except Exception as e: logger.error(f"Error checking orphans for {fk_name}: {e}") @@ -95,7 +119,11 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st for col_name, col_info in table_info.get("columns", {}).items(): col_type = col_info.get("type", "").lower() if "char" in col_type or "text" in col_type: - if "phone" in col_name.lower() or "zip" in col_name.lower() or "postal" in col_name.lower(): + if ( + "phone" in col_name.lower() + or "zip" in col_name.lower() + or "postal" in col_name.lower() + ): # Regex for anything not a digit, hyphen, or period anomaly_q = f""" SELECT COUNT(*) as non_numeric_count @@ -104,13 +132,17 @@ def profile_postgres_data(conn: Any, schema_name: str, schema_structure: Dict[st """ try: res = _execute_query(conn, anomaly_q)[0] - non_numeric_count = int(res['non_numeric_count']) + non_numeric_count = int(res["non_numeric_count"]) if non_numeric_count > 0: key = f"{table_name}.{col_name}" if key not in profile_results["type_anomalies"]: profile_results["type_anomalies"][key] = [] - profile_results["type_anomalies"][key].append(f"Found {non_numeric_count} rows with non-numeric characters in sample.") + profile_results["type_anomalies"][key].append( + f"Found {non_numeric_count} rows with non-numeric characters in sample." + ) except Exception as e: - logger.warning(f"Error checking type anomaly for {full_table_name}.\"{col_name}\": {e}") + logger.warning( + f'Error checking type anomaly for {full_table_name}."{col_name}": {e}' + ) return profile_results diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py index ab0a05e..e7dd8a8 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py @@ -3,9 +3,9 @@ database_cred_agent = LlmAgent( - model='gemini-2.5-flash', - name='database_cred_agent', - description='A helpful assistant that collects and validates database connection details, and lists available schemas.', + model="gemini-2.5-flash", + name="database_cred_agent", + description="A helpful assistant that collects and validates database connection details, and lists available schemas.", instruction=""" ### Role You are a helpful assistant responsible for gathering, validating, and confirming database connection details from the user, then listing the available schemas for selection. Your responses containing lists of schemas MUST be in raw Markdown format. @@ -69,7 +69,5 @@ - Do not assume or confirm which schema the user will select. Your task ends after presenting the list of schemas and asking the user to choose. 
- If the user asks for database connection details, you may display the host, port, and database name, but you must **never** reveal the password or any sensitive credentials. """, - tools=[ - validate_db_connection - ], + tools=[validate_db_connection], ) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py index 6197ba4..a117ab2 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py @@ -9,6 +9,7 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) + def _get_schemas(conn: Any, db_type: str) -> List[str]: """Fetches list of schemas/databases based on db type.""" schemas = [] @@ -23,22 +24,36 @@ def _get_schemas(conn: Any, db_type: str) -> List[str]: elif db_type == "mysql": cursor.execute("SHOW DATABASES;") # Filtering out default mysql databases - default_dbs = {'information_schema', 'mysql', 'performance_schema', 'sys'} + default_dbs = {"information_schema", "mysql", "performance_schema", "sys"} schemas = [row[0] for row in cursor.fetchall() if row[0] not in default_dbs] elif db_type == "mssql": cursor.execute("SELECT name FROM sys.schemas;") - # Filter out default mssql schemas + # Filter out default mssql schemas default_schemas = { - 'db_accessadmin', 'db_backupoperator', 'db_datareader', 'db_datawriter', - 'db_ddladmin', 'db_denydatareader', 'db_denydatawriter', 'db_owner', - 'db_securityadmin', 'guest', 'INFORMATION_SCHEMA', 'sys' + "db_accessadmin", + "db_backupoperator", + "db_datareader", + "db_datawriter", + "db_ddladmin", + "db_denydatareader", + "db_denydatawriter", + "db_owner", + "db_securityadmin", + "guest", + "INFORMATION_SCHEMA", + "sys", } - schemas = [row[0] for row in cursor.fetchall() if row[0] not in default_schemas] + schemas = [ + row[0] for row in cursor.fetchall() if row[0] not in default_schemas + ] finally: cursor.close() return schemas -async def validate_db_connection(connection_details: Dict[str, Any], tool_context: ToolContext) -> Dict[str, Any]: + +async def validate_db_connection( + connection_details: Dict[str, Any], tool_context: ToolContext +) -> Dict[str, Any]: """Validates a database connection for PostgreSQL, MySQL, or MSSQL, fetches available schemas, and saves metadata to session memory. @@ -96,7 +111,9 @@ async def validate_db_connection(connection_details: Dict[str, Any], tool_contex logger.error(error_msg) return {"status": "error", "message": error_msg} - logger.info(f"{db_type.upper()} connection established successfully for validation.") + logger.info( + f"{db_type.upper()} connection established successfully for validation." 
+ ) # Fetch schemas schemas = _get_schemas(conn, db_type) @@ -106,7 +123,7 @@ async def validate_db_connection(connection_details: Dict[str, Any], tool_contex if "db_connection" in tool_context.state: del tool_context.state["db_connection"] if "db_creds_temp" in tool_context.state: - del tool_context.state["db_creds_temp"] + del tool_context.state["db_creds_temp"] if "selected_schema" in tool_context.state: del tool_context.state["selected_schema"] @@ -120,13 +137,15 @@ async def validate_db_connection(connection_details: Dict[str, Any], tool_contex }, "status": "connected", } - tool_context.state["db_creds_temp"] = {"password": connection_details["password"]} + tool_context.state["db_creds_temp"] = { + "password": connection_details["password"] + } logger.info("Connection metadata saved in session memory.") return { "status": "success", "message": f"{db_type.upper()} connection validated successfully.", - "schemas": schemas + "schemas": schemas, } except Exception as e: @@ -135,4 +154,7 @@ async def validate_db_connection(connection_details: Dict[str, Any], tool_contex del tool_context.state["db_connection"] if "db_creds_temp" in tool_context.state: del tool_context.state["db_creds_temp"] - return {"status": "error", "message": f"Connection/Schema fetch failed for {db_type}: {e}"} + return { + "status": "error", + "message": f"Connection/Schema fetch failed for {db_type}: {e}", + } diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py index ecc778d..8a00f0d 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py @@ -3,11 +3,13 @@ from google.adk.agents.llm_agent import LlmAgent from google.adk.agents.readonly_context import ReadonlyContext + def json_encoder_default(obj): if isinstance(obj, Decimal): return str(obj) raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable") + def qa_agent_instruction(ctx: ReadonlyContext) -> str: """Builds the QA agent's instruction for schema and data profiling queries.""" @@ -28,7 +30,9 @@ def qa_agent_instruction(ctx: ReadonlyContext) -> str: """ try: - schema_json = json.dumps(schema_structure, indent=2, default=json_encoder_default) + schema_json = json.dumps( + schema_structure, indent=2, default=json_encoder_default + ) except Exception as e: schema_json = f"Error serializing schema structure: {e}" @@ -41,11 +45,15 @@ def qa_agent_instruction(ctx: ReadonlyContext) -> str: "Nullability": data_profile.get("nullability", "Not available"), "Cardinality": data_profile.get("cardinality", "Not available"), "Orphan Records": data_profile.get("orphan_records", "Not available"), - "Type Anomalies": data_profile.get("type_anomalies", "Not available") + "Type Anomalies": data_profile.get("type_anomalies", "Not available"), } - profile_message = json.dumps(profile_summary, indent=2, default=json_encoder_default) + profile_message = json.dumps( + profile_summary, indent=2, default=json_encoder_default + ) except Exception: - profile_message = "Data profiling results exist but could not be summarized." + profile_message = ( + "Data profiling results exist but could not be summarized." + ) else: profile_message = ( "Data profiling has not been run yet. " @@ -89,10 +97,11 @@ def qa_agent_instruction(ctx: ReadonlyContext) -> str: Always respond in clear, human-readable sentences. 
If profiling data is missing, offer to run profiling on a sample of up to 10,000 rows to provide a summary. """ + qa_agent = LlmAgent( - model='gemini-2.5-flash', - name='qa_agent', - description='Answers natural language questions about the discovered database schema structure and data profiling results.', + model="gemini-2.5-flash", + name="qa_agent", + description="Answers natural language questions about the discovered database schema structure and data profiling results.", instruction=qa_agent_instruction, - tools=[] + tools=[], ) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py index 7b38717..79c0d1a 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py @@ -2,9 +2,9 @@ from .tools import generate_summary_report, export_full_report, generate_erd_script reporting_agent = LlmAgent( - model='gemini-2.5-flash', - name='reporting_agent', - description='Generates reports, exports data, and creates schema diagrams.', + model="gemini-2.5-flash", + name="reporting_agent", + description="Generates reports, exports data, and creates schema diagrams.", instruction=""" ### Role You are a Reporting Agent. You generate human-readable summaries, export detailed data, and create scripts for schema visualizations based on the analysis performed by other agents. diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py index 259fbda..f809c94 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py @@ -7,7 +7,10 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) -async def generate_summary_report(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: + +async def generate_summary_report( + tool_context: ToolContext, args: Dict[str, Any] +) -> Dict[str, Any]: """ Generates a high-level summary report of the database analysis. 
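The hunk below reflows the data-quality counters inside `generate_summary_report`. For context, the thresholds it applies are >50% NULLs per sampled column and >10% orphaned foreign-key values; on a toy profile (illustrative values only, shaped like the `data_profile` dict the profiling tools store in session state) the counting works like this:

```python
# Illustrative-only data_profile; real values come from the profiling tools above.
data_profile = {
    "nullability": {"customers": {"phone": 72.5, "email": 3.1}},
    "orphan_records": {
        "orders.customer_id -> customers.id": 14.0,
        "orders.status_id -> statuses.id": "Error",  # failed probe, not a number
    },
    "type_anomalies": {"customers.zip": ["Found 12 rows with non-numeric characters in sample."]},
}

# Columns whose sampled NULL percentage exceeds 50%.
null_issues = sum(
    1
    for table in data_profile.get("nullability", {}).values()
    for null_pct in table.values()
    if isinstance(null_pct, (int, float)) and null_pct > 50
)
# Foreign keys with more than 10% orphaned values in the sample.
orphan_issues = sum(
    1
    for orphan_pct in data_profile.get("orphan_records", {}).values()
    if isinstance(orphan_pct, (int, float)) and orphan_pct > 10
)
type_anomalies = len(data_profile.get("type_anomalies", {}))
print(null_issues, orphan_issues, type_anomalies)  # -> 1 1 1
```

The `isinstance` guard skips placeholder strings such as "Error" that the profiling utils record when a probe query fails, so failed probes do not inflate the highlighted counts.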
@@ -39,9 +42,14 @@ async def generate_summary_report(tool_context: ToolContext, args: Dict[str, Any "tables": len(schema_structure.get("tables", {})), "views": len(schema_structure.get("views", {})), "explicit_fks": len(schema_structure.get("foreign_keys", [])), - "inferred_relationships": len(schema_structure.get("inferred_relationships", [])), + "inferred_relationships": len( + schema_structure.get("inferred_relationships", []) + ), "schema_anomalies": len(schema_structure.get("anomalies", [])), - "columns": sum(len(t.get("columns", {})) for t in schema_structure.get("tables", {}).values()), + "columns": sum( + len(t.get("columns", {})) + for t in schema_structure.get("tables", {}).values() + ), } report = f"### Data Discovery Summary for Schema: {selected_schema}\n\n" @@ -50,28 +58,43 @@ async def generate_summary_report(tool_context: ToolContext, args: Dict[str, Any report += f"- Total Columns: {summary['columns']}\n" report += f"- Views Found: {summary['views']}\n" report += f"- Explicit Foreign Keys: {summary['explicit_fks']}\n" - report += f"- Potential Inferred Relationships: {summary['inferred_relationships']}\n" + report += ( + f"- Potential Inferred Relationships: {summary['inferred_relationships']}\n" + ) report += f"- Schema Anomalies Detected: {summary['schema_anomalies']}\n\n" if data_profile: report += "**Data Quality Profile Highlights:**\n" - null_issues = sum(1 for table in data_profile.get("nullability", {}).values() for null_pct in table.values() if isinstance(null_pct, (int, float)) and null_pct > 50) - orphan_issues = sum(1 for orphan_pct in data_profile.get("orphan_records", {}).values() if isinstance(orphan_pct, (int, float)) and orphan_pct > 10) + null_issues = sum( + 1 + for table in data_profile.get("nullability", {}).values() + for null_pct in table.values() + if isinstance(null_pct, (int, float)) and null_pct > 50 + ) + orphan_issues = sum( + 1 + for orphan_pct in data_profile.get("orphan_records", {}).values() + if isinstance(orphan_pct, (int, float)) and orphan_pct > 10 + ) type_anomalies = len(data_profile.get("type_anomalies", {})) report += f"- Columns with >50% NULLs: {null_issues} (in sampled data)\n" - report += f"- FKs with >10% Orphan Records: {orphan_issues} (in sampled data)\n" + report += ( + f"- FKs with >10% Orphan Records: {orphan_issues} (in sampled data)\n" + ) report += f"- Columns with Potential Type Anomalies: {type_anomalies} (in sampled data)\n" else: report += "**Data Quality Profile:** Not yet run.\n" return {"status": "success", "report_text": report} + import json import logging logger = logging.getLogger(__name__) + async def export_full_report(tool_context: ToolContext, args: dict) -> dict: """ Exports the full schema structure and data profile as a clean JSON report. @@ -92,21 +115,30 @@ async def export_full_report(tool_context: ToolContext, args: dict) -> dict: } """ if not isinstance(args, dict): - return {"status": "error", "error": "Invalid arguments. Expected a dictionary for args."} + return { + "status": "error", + "error": "Invalid arguments. Expected a dictionary for args.", + } schema_structure = tool_context.state.get("schema_structure") data_profile = tool_context.state.get("data_profile") if not schema_structure: - return {"status": "error", "error": "Schema structure not found. Please run introspection first."} + return { + "status": "error", + "error": "Schema structure not found. 
Please run introspection first.", + } requested_format = args.get("format", "json").lower() if requested_format != "json": - return {"status": "error", "error": f"Unsupported format '{requested_format}'. Only JSON is supported."} + return { + "status": "error", + "error": f"Unsupported format '{requested_format}'. Only JSON is supported.", + } full_report = { "schema_structure": schema_structure, - "data_profile": data_profile or "Not run" + "data_profile": data_profile or "Not run", } def safe_encoder(obj): @@ -122,17 +154,14 @@ def safe_encoder(obj): try: json_output = json.dumps( - full_report, - indent=2, - ensure_ascii=False, - default=safe_encoder + full_report, indent=2, ensure_ascii=False, default=safe_encoder ) return { "status": "success", "message": "Full report generated in JSON format. You can copy the content below.", "report_content": json_output, - "format": "JSON" + "format": "JSON", } except Exception as e: @@ -140,7 +169,9 @@ def safe_encoder(obj): return {"status": "error", "error": f"Failed to generate JSON report: {str(e)}"} -async def generate_erd_script(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: +async def generate_erd_script( + tool_context: ToolContext, args: Dict[str, Any] +) -> Dict[str, Any]: """ Generates a complete, valid Mermaid ER Diagram script. @@ -171,14 +202,14 @@ async def generate_erd_script(tool_context: ToolContext, args: Dict[str, Any]) - if not isinstance(args, dict): return { "status": "error", - "error": "Invalid arguments. Expected a dictionary for args." + "error": "Invalid arguments. Expected a dictionary for args.", } schema_structure = tool_context.state.get("schema_structure") if not schema_structure: return { "status": "error", - "error": "Schema structure not found. Please run introspection first." + "error": "Schema structure not found. Please run introspection first.", } def sanitize_datatype(dtype: str) -> str: @@ -198,9 +229,16 @@ def sanitize_datatype(dtype: str) -> str: return "enum" if "timestamp" in dtype: return "timestamp" - return dtype.replace("(", "").replace(")", "").replace(",", "").replace(" ", "_") + return ( + dtype.replace("(", "").replace(")", "").replace(",", "").replace(" ", "_") + ) - def format_column(table_name: str, col_name: str, col_info: Dict[str, Any], constraints_info: List[Dict[str, Any]]) -> str: + def format_column( + table_name: str, + col_name: str, + col_info: Dict[str, Any], + constraints_info: List[Dict[str, Any]], + ) -> str: """Format a column entry with proper constraints for Mermaid.""" dtype = sanitize_datatype(col_info.get("type", "text")) constraints = [] @@ -238,7 +276,9 @@ def format_column(table_name: str, col_name: str, col_info: Dict[str, Any], cons constraints_info = table_info.get("constraints", []) for col_name, col_info in columns.items(): - lines.append(format_column(table_name, col_name, col_info, constraints_info)) + lines.append( + format_column(table_name, col_name, col_info, constraints_info) + ) lines.append(" }") lines.append("") @@ -273,5 +313,5 @@ def format_column(table_name: str, col_name: str, col_info: Dict[str, Any], cons "status": "success", "message": "Mermaid ERD script generated successfully. 
Paste this code into any Mermaid renderer.", "script_type": "Mermaid", - "script": mermaid_script - } \ No newline at end of file + "script": mermaid_script, + } diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py index 846b8ef..7215214 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py @@ -3,9 +3,9 @@ import json schema_introspection_agent = LlmAgent( - model='gemini-2.5-flash', - name='schema_introspection_agent', - description='Introspects the selected database schema to discover tables, columns, constraints, relationships, indexes, and views.', + model="gemini-2.5-flash", + name="schema_introspection_agent", + description="Introspects the selected database schema to discover tables, columns, constraints, relationships, indexes, and views.", instruction=""" ### Role You are a **Database Schema Introspection Agent**. Your sole task is to fetch and summarize the schema structure of a database. @@ -68,7 +68,5 @@ ``` - Focus **only** on fetching and summarizing schema details. """, - tools=[ - get_schema_details - ], + tools=[get_schema_details], ) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py index 08cb65e..2935e8c 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py @@ -13,6 +13,7 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) + def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: db_type = metadata.get("db_type") host = metadata.get("host") @@ -21,19 +22,28 @@ def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: user = metadata.get("user") if not all([db_type, host, port, dbname, user, password is not None]): - raise ValueError("Missing one or more required connection parameters in metadata or password.") + raise ValueError( + "Missing one or more required connection parameters in metadata or password." 
+ ) port = int(port) - logger.info(f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}") + logger.info( + f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}" + ) if db_type == "postgresql": - return psycopg2.connect(host=host, port=port, dbname=dbname, user=user, password=password) + return psycopg2.connect( + host=host, port=port, dbname=dbname, user=user, password=password + ) elif db_type == "mysql": - return mysql.connector.connect(host=host, port=port, database=dbname, user=user, password=password) + return mysql.connector.connect( + host=host, port=port, database=dbname, user=user, password=password + ) elif db_type == "mssql": conn_str = f"DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={host},{port};DATABASE={dbname};UID={user};PWD={password}" return pyodbc.connect(conn_str) else: raise ValueError(f"Unsupported database type: {db_type}") + def _generate_summary(schema_details: Dict[str, Any]) -> Dict[str, int]: """Generates a summary of the introspected schema structure.""" summary = { @@ -52,7 +62,10 @@ def _generate_summary(schema_details: Dict[str, Any]) -> Dict[str, int]: summary["indexes"] += len(table_info.get("indexes", [])) return summary -async def get_schema_details(tool_context: ToolContext, args: Dict[str, Any]) -> Dict[str, Any]: + +async def get_schema_details( + tool_context: ToolContext, args: Dict[str, Any] +) -> Dict[str, Any]: """ Retrieves detailed schema information and a summary for the given schema_name. Updates the session state with the selected_schema and schema_structure. @@ -81,10 +94,14 @@ async def get_schema_details(tool_context: ToolContext, args: Dict[str, Any]) -> conn = None try: conn = _get_db_connection(metadata, password) - logger.info(f"Successfully reconnected to {db_type} for introspection of schema '{schema_name}'.") + logger.info( + f"Successfully reconnected to {db_type} for introspection of schema '{schema_name}'." 
+ ) if db_type == "postgresql": - schema_details = postgresql_utils.get_postgres_schema_details(conn, schema_name) + schema_details = postgresql_utils.get_postgres_schema_details( + conn, schema_name + ) elif db_type == "mysql": schema_details = mysql_utils.get_mysql_schema_details(conn, schema_name) elif db_type == "mssql": @@ -101,12 +118,16 @@ async def get_schema_details(tool_context: ToolContext, args: Dict[str, Any]) -> "status": "success", "message": f"Schema details for '{schema_name}' ({db_type}) retrieved and stored.", "schema_name": schema_name, - "summary": summary # Include the summary + "summary": summary, # Include the summary } except Exception as e: logger.error(f"Error during schema introspection: {e}", exc_info=True) - return {"error": f"Failed to get schema details for {db_type} ({schema_name}): {str(e)}"} + return { + "error": f"Failed to get schema details for {db_type} ({schema_name}): {str(e)}" + } finally: if conn: - try: conn.close() - except Exception as e: logger.error(f"Error closing {db_type} connection: {e}") + try: + conn.close() + except Exception as e: + logger.error(f"Error closing {db_type} connection: {e}") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py index d94992b..e00f2bc 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py @@ -19,10 +19,14 @@ GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") if not GOOGLE_CLOUD_PROJECT: - logger.warning("GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials.") + logger.warning( + "GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials." + ) GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") -GOOGLE_GENAI_USE_VERTEXAI = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "True").lower() in ("true", "1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get( + "GOOGLE_GENAI_USE_VERTEXAI", "True" +).lower() in ("true", "1") MODEL = os.environ.get("MODEL", "gemini-2.5-pro") client = None @@ -33,11 +37,16 @@ project=GOOGLE_CLOUD_PROJECT, location=GOOGLE_CLOUD_LOCATION, ) - logger.info(f"GenAI Client initialized in postgres_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}") + logger.info( + f"GenAI Client initialized in postgres_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}" + ) except Exception as e: logger.error(f"Failed to initialize GenAI Client in postgres_utils: {e}") else: - logger.error("Cannot initialize GenAI Client in postgres_utils: GOOGLE_CLOUD_PROJECT is not set.") + logger.error( + "Cannot initialize GenAI Client in postgres_utils: GOOGLE_CLOUD_PROJECT is not set." 
+ ) + def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" @@ -52,19 +61,22 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: finally: cursor.close() -def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> str: + +def _construct_llm_prompt( + schema_name: str, db_type: str, schema_details: Dict[str, Any] +) -> str: """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" tables_context = {} for table_name, table_info in schema_details.get("tables", {}).items(): tables_context[table_name] = { "columns": list(table_info.get("columns", {}).keys()), - "constraints": table_info.get("constraints", []) + "constraints": table_info.get("constraints", []), } context = { "db_type": db_type, "schema_name": schema_name, "tables": tables_context, - "existing_foreign_keys": schema_details.get("foreign_keys", []) + "existing_foreign_keys": schema_details.get("foreign_keys", []), } context_json = json.dumps(context, indent=4) prompt = f""" @@ -123,18 +135,26 @@ def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[s """ return prompt + def _extract_json_content(text: str) -> str: """Extracts JSON content from Markdown-style code fences (```json ... ```).""" - if not text: return "" + if not text: + return "" match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) extracted = match.group(1).strip() if match else text.strip() return extracted -def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]: + +def _analyze_with_llm( + schema_name: str, db_type: str, schema_details: Dict[str, Any] +) -> Dict[str, List[Dict[str, Any]]]: """Calls an LLM to get inferred relationships and anomalies.""" if not client: logger.error("GenAI Client not initialized. Skipping LLM analysis.") - return {"inferred_relationships": [], "anomalies": [{"error": "LLM client not available."}]} + return { + "inferred_relationships": [], + "anomalies": [{"error": "LLM client not available."}], + } prompt = _construct_llm_prompt(schema_name, db_type, schema_details) logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") @@ -149,25 +169,47 @@ def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, generated_text = response.candidates[0].content.parts[0].text logger.debug(f"****** Raw LLM Response: {generated_text}") cleaned_json = _extract_json_content(generated_text) - logger.debug(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + logger.debug( + f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}" + ) llm_output = json.loads(cleaned_json) inferred = llm_output.get("inferred_relationships", []) anomalies = llm_output.get("anomalies", []) if not isinstance(inferred, list) or not isinstance(anomalies, list): - raise ValueError("LLM response is not in the expected list format for keys.") + raise ValueError( + "LLM response is not in the expected list format for keys." + ) return {"inferred_relationships": inferred, "anomalies": anomalies} except json.JSONDecodeError as e: - logger.error(f"Error decoding LLM JSON response: {e}. Cleaned Response: {cleaned_json}") - return {"inferred_relationships": [], "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}]} + logger.error( + f"Error decoding LLM JSON response: {e}. 
Cleaned Response: {cleaned_json}" + ) + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}], + } except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: logger.error(f"Error calling LLM or processing response: {e}") - return {"inferred_relationships": [], "anomalies": [{"error": f"LLM analysis failed: {e}"}]} + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM analysis failed: {e}"}], + } except Exception as e: logger.error(f"Unexpected error during LLM analysis: {e}", exc_info=True) - return {"inferred_relationships": [], "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}]} + return { + "inferred_relationships": [], + "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}], + } + def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: - details = {"tables": {}, "views": {}, "foreign_keys": [], "inferred_relationships": [], "anomalies": []} + details = { + "tables": {}, + "views": {}, + "foreign_keys": [], + "inferred_relationships": [], + "anomalies": [], + } logger.info(f"Fetching PostgreSQL schema details for: {schema_name}") tables_query = f""" @@ -177,16 +219,20 @@ def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: """ tables = _execute_query(conn, tables_query) for table in tables: - t_name = table['table_name'] + t_name = table["table_name"] details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} cols_query = f""" SELECT column_name, data_type, character_maximum_length, numeric_precision, numeric_scale, is_nullable, column_default FROM information_schema.columns WHERE table_schema = '{schema_name}' AND table_name = '{t_name}'; """ for col in _execute_query(conn, cols_query): - details["tables"][t_name]["columns"][col['column_name']] = { - "type": col['data_type'], "length": col['character_maximum_length'], "precision": col['numeric_precision'], - "scale": col['numeric_scale'], "nullable": col['is_nullable'] == 'YES', "default": col['column_default'], + details["tables"][t_name]["columns"][col["column_name"]] = { + "type": col["data_type"], + "length": col["character_maximum_length"], + "precision": col["numeric_precision"], + "scale": col["numeric_scale"], + "nullable": col["is_nullable"] == "YES", + "default": col["column_default"], } constraints_query = f""" SELECT tc.table_name, tc.constraint_name, tc.constraint_type, kcu.column_name, cc.check_clause @@ -195,7 +241,9 @@ def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: LEFT JOIN information_schema.check_constraints cc ON tc.constraint_name = cc.constraint_name AND tc.table_schema = cc.constraint_schema WHERE tc.table_schema = '{schema_name}' AND tc.table_name = '{t_name}'; """ - details["tables"][t_name]["constraints"] = _execute_query(conn, constraints_query) + details["tables"][t_name]["constraints"] = _execute_query( + conn, constraints_query + ) indexes_query = f""" SELECT t.relname AS table_name, i.relname AS index_name, a.attname AS column_name, ix.indisunique AS is_unique @@ -207,12 +255,21 @@ def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: indexes = _execute_query(conn, indexes_query) grouped_indexes = {} for index in indexes: - if index['column_name']: - idx_name = index['index_name'] - if idx_name not in grouped_indexes: grouped_indexes[idx_name] = {"name": idx_name, "columns": [], "unique": index['is_unique']} - if index['column_name'] not in 
grouped_indexes[idx_name]["columns"]: grouped_indexes[idx_name]["columns"].append(index['column_name']) + if index["column_name"]: + idx_name = index["index_name"] + if idx_name not in grouped_indexes: + grouped_indexes[idx_name] = { + "name": idx_name, + "columns": [], + "unique": index["is_unique"], + } + if index["column_name"] not in grouped_indexes[idx_name]["columns"]: + grouped_indexes[idx_name]["columns"].append( + index["column_name"] + ) details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) - except Exception as e: logger.error(f"Error fetching PostgreSQL indexes for {t_name}: {e}") + except Exception as e: + logger.error(f"Error fetching PostgreSQL indexes for {t_name}: {e}") fks_query = f""" SELECT @@ -226,11 +283,18 @@ def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: """ details["foreign_keys"] = _execute_query(conn, fks_query) views_query = f"SELECT table_name AS view_name, view_definition FROM information_schema.views WHERE table_schema = '{schema_name}';" - details["views"] = {view['view_name']: {"definition": view['view_definition']} for view in _execute_query(conn, views_query)} + details["views"] = { + view["view_name"]: {"definition": view["view_definition"]} + for view in _execute_query(conn, views_query) + } llm_analysis = _analyze_with_llm(schema_name, "PostgreSQL", details) details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) details["anomalies"] = llm_analysis.get("anomalies", []) - logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships for PostgreSQL.") - logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies for PostgreSQL.") + logger.info( + f"Found {len(details['inferred_relationships'])} potential inferred relationships for PostgreSQL." + ) + logger.info( + f"Found {len(details['anomalies'])} potential relationship anomalies for PostgreSQL." + ) return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py index 1e3a2fc..0d7fdc7 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py @@ -19,10 +19,14 @@ GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") if not GOOGLE_CLOUD_PROJECT: - logger.warning("GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials.") + logger.warning( + "GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials." + ) GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") -GOOGLE_GENAI_USE_VERTEXAI = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "True").lower() in ("true", "1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get( + "GOOGLE_GENAI_USE_VERTEXAI", "True" +).lower() in ("true", "1") MODEL = "gemini-2.5-pro" client = None @@ -33,12 +37,15 @@ project=GOOGLE_CLOUD_PROJECT, location=GOOGLE_CLOUD_LOCATION, ) - logger.info(f"GenAI Client initialized. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}") + logger.info( + f"GenAI Client initialized. 
VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}" + ) except Exception as e: logger.error(f"Failed to initialize GenAI Client: {e}") else: logger.error("Cannot initialize GenAI Client: GOOGLE_CLOUD_PROJECT is not set.") + def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts.""" cursor = conn.cursor(dictionary=True) @@ -48,21 +55,24 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: finally: cursor.close() -def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> str: + +def _construct_llm_prompt( + schema_name: str, db_type: str, schema_details: Dict[str, Any] +) -> str: """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" tables_context = {} for table_name, table_info in schema_details.get("tables", {}).items(): tables_context[table_name] = { "columns": list(table_info.get("columns", {}).keys()), - "constraints": table_info.get("constraints", []) + "constraints": table_info.get("constraints", []), } context = { "db_type": db_type, "schema_name": schema_name, "tables": tables_context, - "existing_foreign_keys": schema_details.get("foreign_keys", []) + "existing_foreign_keys": schema_details.get("foreign_keys", []), } # Format JSON for readability @@ -124,6 +134,7 @@ def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[s """ return prompt + def _extract_json_content(text: str) -> str: """ Extracts JSON content from Markdown-style code fences (```json ... ```). @@ -144,13 +155,16 @@ def _extract_json_content(text: str) -> str: except json.JSONDecodeError: return extracted -def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]: + +def _analyze_with_llm( + schema_name: str, db_type: str, schema_details: Dict[str, Any] +) -> Dict[str, List[Dict[str, Any]]]: """Calls an LLM to get inferred relationships and anomalies.""" if not client: logger.error("GenAI Client not initialized. Skipping LLM analysis.") return { "inferred_relationships": [], - "anomalies": [{"error": "LLM client not available."}] + "anomalies": [{"error": "LLM client not available."}], } prompt = _construct_llm_prompt(schema_name, db_type, schema_details) @@ -167,7 +181,9 @@ def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, # handles ```json blocks cleaned_json = _extract_json_content(generated_text) - logger.debug(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + logger.debug( + f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}" + ) # Parse the cleaned JSON llm_output = json.loads(cleaned_json) @@ -175,32 +191,34 @@ def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, anomalies = llm_output.get("anomalies", []) if not isinstance(inferred, list) or not isinstance(anomalies, list): - raise ValueError("LLM response is not in the expected list format for keys.") + raise ValueError( + "LLM response is not in the expected list format for keys." + ) - return { - "inferred_relationships": inferred, - "anomalies": anomalies - } + return {"inferred_relationships": inferred, "anomalies": anomalies} except json.JSONDecodeError as e: - logger.error(f"Error decoding LLM JSON response: {e}. Cleaned Response: {cleaned_json}") + logger.error( + f"Error decoding LLM JSON response: {e}. 
Cleaned Response: {cleaned_json}" + ) return { "inferred_relationships": [], - "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}] + "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}], } except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: logger.error(f"Error calling LLM or processing response: {e}") return { "inferred_relationships": [], - "anomalies": [{"error": f"LLM analysis failed: {e}"}] + "anomalies": [{"error": f"LLM analysis failed: {e}"}], } except Exception as e: logger.error(f"Unexpected error during LLM analysis: {e}") return { "inferred_relationships": [], - "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}] + "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}], } + def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: # For MySQL, schema_name is the database name. logger.info(f"Fetching MySQL schema details for: {schema_name}") @@ -210,7 +228,13 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: logger.error(f"MySQL change database failed: {err}") raise - details = {"tables": {}, "views": {}, "foreign_keys": [], "inferred_relationships": [], "anomalies": []} + details = { + "tables": {}, + "views": {}, + "foreign_keys": [], + "inferred_relationships": [], + "anomalies": [], + } # 1. Fetch Basic Schema Info tables_query = "SHOW FULL TABLES WHERE Table_type = 'BASE TABLE';" @@ -222,9 +246,12 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: cols_query = f"DESCRIBE `{t_name}`;" columns = _execute_query(conn, cols_query) for col in columns: - details["tables"][t_name]["columns"][col['Field']] = { - "type": col['Type'], "nullable": col['Null'] == 'YES', "default": col['Default'], - "key": col['Key'], "extra": col['Extra'], + details["tables"][t_name]["columns"][col["Field"]] = { + "type": col["Type"], + "nullable": col["Null"] == "YES", + "default": col["Default"], + "key": col["Key"], + "extra": col["Extra"], } constraints_query = f""" @@ -235,16 +262,22 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: WHERE TC.TABLE_SCHEMA = '{schema_name}' AND TC.TABLE_NAME = '{t_name}' AND TC.CONSTRAINT_TYPE IN ('PRIMARY KEY', 'UNIQUE', 'FOREIGN KEY', 'CHECK'); """ - details["tables"][t_name]["constraints"] = _execute_query(conn, constraints_query) + details["tables"][t_name]["constraints"] = _execute_query( + conn, constraints_query + ) indexes_query = f"SHOW INDEX FROM `{t_name}`;" indexes = _execute_query(conn, indexes_query) grouped_indexes = {} for index in indexes: - idx_name = index['Key_name'] + idx_name = index["Key_name"] if idx_name not in grouped_indexes: - grouped_indexes[idx_name] = {"name": idx_name, "columns": [], "unique": index['Non_unique'] == 0} - grouped_indexes[idx_name]["columns"].append(index['Column_name']) + grouped_indexes[idx_name] = { + "name": idx_name, + "columns": [], + "unique": index["Non_unique"] == 0, + } + grouped_indexes[idx_name]["columns"].append(index["Column_name"]) details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) fks_query = f""" @@ -261,7 +294,7 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: try: definition_query = f"SHOW CREATE VIEW `{v_name}`;" definition = _execute_query(conn, definition_query) - details["views"][v_name] = {"definition": definition[0]['Create View']} + details["views"][v_name] = {"definition": definition[0]["Create View"]} except Exception as e: logger.warning(f"Could not fetch 
view definition for {v_name}: {e}") details["views"][v_name] = {"definition": "N/A"} @@ -271,11 +304,13 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) details["anomalies"] = llm_analysis.get("anomalies", []) - logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships.") + logger.info( + f"Found {len(details['inferred_relationships'])} potential inferred relationships." + ) logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies.") logger.debug("************************") logger.info(details) logger.debug("************************") - + return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py index 9f2a678..7371f55 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py @@ -19,10 +19,14 @@ GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") if not GOOGLE_CLOUD_PROJECT: - logger.warning("GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials.") + logger.warning( + "GOOGLE_CLOUD_PROJECT not set in environment or Application Default Credentials." + ) GOOGLE_CLOUD_LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1") -GOOGLE_GENAI_USE_VERTEXAI = os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "True").lower() in ("true", "1") +GOOGLE_GENAI_USE_VERTEXAI = os.environ.get( + "GOOGLE_GENAI_USE_VERTEXAI", "True" +).lower() in ("true", "1") MODEL = os.environ.get("MODEL", "gemini-2.5-pro") client = None @@ -33,11 +37,16 @@ project=GOOGLE_CLOUD_PROJECT, location=GOOGLE_CLOUD_LOCATION, ) - logger.info(f"GenAI Client initialized in postgres_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}") + logger.info( + f"GenAI Client initialized in postgres_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}" + ) except Exception as e: logger.error(f"Failed to initialize GenAI Client in postgres_utils: {e}") else: - logger.error("Cannot initialize GenAI Client in postgres_utils: GOOGLE_CLOUD_PROJECT is not set.") + logger.error( + "Cannot initialize GenAI Client in postgres_utils: GOOGLE_CLOUD_PROJECT is not set." 
+ ) + def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" @@ -52,19 +61,22 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: finally: cursor.close() -def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> str: + +def _construct_llm_prompt( + schema_name: str, db_type: str, schema_details: Dict[str, Any] +) -> str: """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" tables_context = {} for table_name, table_info in schema_details.get("tables", {}).items(): tables_context[table_name] = { "columns": list(table_info.get("columns", {}).keys()), - "constraints": table_info.get("constraints", []) + "constraints": table_info.get("constraints", []), } context = { "db_type": db_type, "schema_name": schema_name, "tables": tables_context, - "existing_foreign_keys": schema_details.get("foreign_keys", []) + "existing_foreign_keys": schema_details.get("foreign_keys", []), } context_json = json.dumps(context, indent=4) prompt = f""" @@ -123,9 +135,11 @@ def _construct_llm_prompt(schema_name: str, db_type: str, schema_details: Dict[s """ return prompt + def _extract_json_content(text: str) -> str: """Extracts JSON content from Markdown-style code fences (```json ... ```).""" - if not text: return "" + if not text: + return "" match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL) extracted = match.group(1).strip() if match else text.strip() try: @@ -134,11 +148,17 @@ def _extract_json_content(text: str) -> str: except json.JSONDecodeError: return extracted -def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]: + +def _analyze_with_llm( + schema_name: str, db_type: str, schema_details: Dict[str, Any] +) -> Dict[str, List[Dict[str, Any]]]: """Calls an LLM to get inferred relationships and anomalies.""" if not client: logger.error("GenAI Client not initialized. Skipping LLM analysis.") - return {"inferred_relationships": [], "anomalies": [{"error": "LLM client not available."}]} + return { + "inferred_relationships": [], + "anomalies": [{"error": "LLM client not available."}], + } prompt = _construct_llm_prompt(schema_name, db_type, schema_details) logger.info(f"Sending prompt to LLM for {db_type} relationship analysis.") @@ -153,40 +173,66 @@ def _analyze_with_llm(schema_name: str, db_type: str, schema_details: Dict[str, generated_text = response.candidates[0].content.parts[0].text logger.debug(f"****** Raw LLM Response: {generated_text}") cleaned_json = _extract_json_content(generated_text) - logger.debug(f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}") + logger.debug( + f"****** Cleaned JSON Extracted from LLM Response:\n{cleaned_json}" + ) llm_output = json.loads(cleaned_json) inferred = llm_output.get("inferred_relationships", []) anomalies = llm_output.get("anomalies", []) if not isinstance(inferred, list) or not isinstance(anomalies, list): - raise ValueError("LLM response is not in the expected list format for keys.") + raise ValueError( + "LLM response is not in the expected list format for keys." + ) return {"inferred_relationships": inferred, "anomalies": anomalies} except json.JSONDecodeError as e: - logger.error(f"Error decoding LLM JSON response: {e}. 
Cleaned Response: {cleaned_json}") - return {"inferred_relationships": [], "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}]} + logger.error( + f"Error decoding LLM JSON response: {e}. Cleaned Response: {cleaned_json}" + ) + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM response was not valid JSON: {e}"}], + } except (exceptions.GoogleAPICallError, IndexError, AttributeError, ValueError) as e: logger.error(f"Error calling LLM or processing response: {e}") - return {"inferred_relationships": [], "anomalies": [{"error": f"LLM analysis failed: {e}"}]} + return { + "inferred_relationships": [], + "anomalies": [{"error": f"LLM analysis failed: {e}"}], + } except Exception as e: logger.error(f"Unexpected error during LLM analysis: {e}") - return {"inferred_relationships": [], "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}]} + return { + "inferred_relationships": [], + "anomalies": [{"error": f"Unexpected LLM analysis error: {e}"}], + } + def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: - details = {"tables": {}, "views": {}, "foreign_keys": [], "inferred_relationships": [], "anomalies": []} + details = { + "tables": {}, + "views": {}, + "foreign_keys": [], + "inferred_relationships": [], + "anomalies": [], + } logger.info(f"Fetching PostgreSQL schema details for: {schema_name}") tables_query = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE';" tables = _execute_query(conn, tables_query) for table in tables: - t_name = table['table_name'] + t_name = table["table_name"] details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} cols_query = f""" SELECT column_name, data_type, character_maximum_length, numeric_precision, numeric_scale, is_nullable, column_default FROM information_schema.columns WHERE table_schema = '{schema_name}' AND table_name = '{t_name}'; """ for col in _execute_query(conn, cols_query): - details["tables"][t_name]["columns"][col['column_name']] = { - "type": col['data_type'], "length": col['character_maximum_length'], "precision": col['numeric_precision'], - "scale": col['numeric_scale'], "nullable": col['is_nullable'] == 'YES', "default": col['column_default'], + details["tables"][t_name]["columns"][col["column_name"]] = { + "type": col["data_type"], + "length": col["character_maximum_length"], + "precision": col["numeric_precision"], + "scale": col["numeric_scale"], + "nullable": col["is_nullable"] == "YES", + "default": col["column_default"], } constraints_query = f""" SELECT tc.table_name, tc.constraint_name, tc.constraint_type, kcu.column_name, cc.check_clause @@ -195,7 +241,9 @@ def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: LEFT JOIN information_schema.check_constraints cc ON tc.constraint_name = cc.constraint_name AND tc.table_schema = cc.constraint_schema WHERE tc.table_schema = '{schema_name}' AND tc.table_name = '{t_name}'; """ - details["tables"][t_name]["constraints"] = _execute_query(conn, constraints_query) + details["tables"][t_name]["constraints"] = _execute_query( + conn, constraints_query + ) indexes_query = f""" SELECT t.relname AS table_name, i.relname AS index_name, a.attname AS column_name, ix.indisunique AS is_unique FROM pg_class t JOIN pg_index ix ON t.oid = ix.indrelid JOIN pg_class i ON i.oid = ix.indexrelid @@ -206,12 +254,21 @@ def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: indexes = _execute_query(conn, 
indexes_query) grouped_indexes = {} for index in indexes: - if index['column_name']: - idx_name = index['index_name'] - if idx_name not in grouped_indexes: grouped_indexes[idx_name] = {"name": idx_name, "columns": [], "unique": index['is_unique']} - if index['column_name'] not in grouped_indexes[idx_name]["columns"]: grouped_indexes[idx_name]["columns"].append(index['column_name']) + if index["column_name"]: + idx_name = index["index_name"] + if idx_name not in grouped_indexes: + grouped_indexes[idx_name] = { + "name": idx_name, + "columns": [], + "unique": index["is_unique"], + } + if index["column_name"] not in grouped_indexes[idx_name]["columns"]: + grouped_indexes[idx_name]["columns"].append( + index["column_name"] + ) details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) - except Exception as e: logger.error(f"Error fetching PostgreSQL indexes for {t_name}: {e}") + except Exception as e: + logger.error(f"Error fetching PostgreSQL indexes for {t_name}: {e}") fks_query = f""" SELECT tc.constraint_name, tc.table_name AS from_table, kcu.column_name AS from_column, @@ -224,11 +281,16 @@ def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: """ details["foreign_keys"] = _execute_query(conn, fks_query) views_query = f"SELECT table_name AS view_name, view_definition FROM information_schema.views WHERE table_schema = '{schema_name}';" - for view in _execute_query(conn, views_query): details["views"][view['view_name']] = {"definition": view['view_definition']} + for view in _execute_query(conn, views_query): + details["views"][view["view_name"]] = {"definition": view["view_definition"]} llm_analysis = _analyze_with_llm(schema_name, "PostgreSQL", details) details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) details["anomalies"] = llm_analysis.get("anomalies", []) - logger.info(f"Found {len(details['inferred_relationships'])} potential inferred relationships for PostgreSQL.") - logger.info(f"Found {len(details['anomalies'])} potential relationship anomalies for PostgreSQL.") + logger.info( + f"Found {len(details['inferred_relationships'])} potential inferred relationships for PostgreSQL." + ) + logger.info( + f"Found {len(details['anomalies'])} potential relationship anomalies for PostgreSQL." 
+ ) return details From 7a9b6d9c6fe0ffcfb6b7cc80bf333955f916f73d Mon Sep 17 00:00:00 2001 From: SanuGhosh Date: Wed, 19 Nov 2025 17:46:51 +0530 Subject: [PATCH 7/8] feat(data-model-discovery-agent): fix lint issues --- agent-app/app/agent.py | 3 +-- .../data_model_discovery_agent/agent.py | 14 +++++----- .../sub_agents/data_profiling_agent/agent.py | 6 +++-- .../sub_agents/data_profiling_agent/tools.py | 22 ++++++++-------- .../utils/mssql_profiling_utils.py | 12 ++++----- .../utils/mysql_profiling_utils.py | 8 +++--- .../utils/postgres_profiling_utils.py | 12 ++++----- .../sub_agents/database_cred_agent/agent.py | 6 +++-- .../sub_agents/database_cred_agent/tools.py | 12 ++++----- .../sub_agents/qa_agent/agent.py | 5 +++- .../sub_agents/reporting_agent/agent.py | 7 +++-- .../sub_agents/reporting_agent/tools.py | 26 +++++++------------ .../schema_introspection_agent/agent.py | 8 +++--- .../schema_introspection_agent/tools.py | 19 +++++++------- .../utils/mssql_utils.py | 20 +++++++------- .../utils/mysql_utils.py | 23 ++++++++-------- .../utils/postgresql_utils.py | 20 +++++++------- 17 files changed, 114 insertions(+), 109 deletions(-) diff --git a/agent-app/app/agent.py b/agent-app/app/agent.py index 05774db..7818752 100644 --- a/agent-app/app/agent.py +++ b/agent-app/app/agent.py @@ -28,6 +28,7 @@ mosaic_rag_agent_presales, ) from .sub_agents.compliance_and_security_baseline_agent import compliance_agent +from .sub_agents.data_model_discovery_agent import data_model_discovery_agent from .sub_agents.detailed_architecture_design_agent import ( detailed_architecture_design_agent, ) @@ -38,8 +39,6 @@ from .sub_agents.recommendation_agent import recommendation_agent from .sub_agents.strategy_recommender_agent import strategy_recommender_agent from .sub_agents.tech_stack_profiler_agent import tech_stack_profiler -from .sub_agents.data_model_discovery_agent import data_model_discovery_agent - from .tools import ( transfer_to_capability_mapper_agent_tool, transfer_to_discovery_agent_tool, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py index b4da3c9..761baee 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/agent.py @@ -1,13 +1,15 @@ +import logging + from google.adk.agents.llm_agent import LlmAgent from google.adk.agents.readonly_context import ReadonlyContext + +from app.config import MODEL + +from .sub_agents.data_profiling_agent.agent import data_profiling_agent from .sub_agents.database_cred_agent.agent import database_cred_agent -from .sub_agents.schema_introspection_agent.agent import schema_introspection_agent from .sub_agents.qa_agent.agent import qa_agent -from .sub_agents.data_profiling_agent.agent import data_profiling_agent from .sub_agents.reporting_agent.agent import reporting_agent - -import logging -import json +from .sub_agents.schema_introspection_agent.agent import schema_introspection_agent logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -200,7 +202,7 @@ def root_agent_instruction(ctx: ReadonlyContext) -> str: data_model_discovery_agent = LlmAgent( - model="gemini-2.5-flash", + model=MODEL, name="data_model_discovery_agent", description=( "A helpful root agent that orchestrates sub-agents to introspect and profile legacy databases." 
diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py index 7b3c499..cbb7911 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/agent.py @@ -1,9 +1,11 @@ from google.adk.agents.llm_agent import LlmAgent + +from app.config import MODEL + from .tools import profile_schema_data -from ..qa_agent.agent import qa_agent data_profiling_agent = LlmAgent( - model="gemini-2.5-flash", + model=MODEL, name="data_profiling_agent", description="Profiles data quality for the selected schema and then calls QA agent to summarize.", instruction=""" diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py index d07dcab..9493e35 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py @@ -1,20 +1,22 @@ import logging -from typing import Dict, Any -from google.adk.tools import ToolContext -import psycopg2 +from typing import Any + import mysql.connector +import psycopg2 import pyodbc +from google.adk.tools import ToolContext + from .utils import ( - postgres_profiling_utils, - mysql_profiling_utils, mssql_profiling_utils, + mysql_profiling_utils, + postgres_profiling_utils, ) logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) -def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: +def _get_db_connection(metadata: dict[str, Any], password: str) -> Any: db_type = metadata.get("db_type") host = metadata.get("host") port = int(metadata.get("port")) @@ -39,8 +41,8 @@ def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: async def profile_schema_data( - tool_context: ToolContext, args: Dict[str, Any] -) -> Dict[str, Any]: + tool_context: ToolContext, args: dict[str, Any] +) -> dict[str, Any]: """ Profiles the data in the selected schema based on the schema structure. Calculates nullability, cardinality, orphan records, and type anomalies. 
@@ -103,9 +105,7 @@ async def profile_schema_data( } except Exception as e: logger.error(f"Error during data profiling: {e}", exc_info=True) - return { - "error": f"Failed to profile data for {db_type} ({schema_name}): {str(e)}" - } + return {"error": f"Failed to profile data for {db_type} ({schema_name}): {e!s}"} finally: if conn: try: diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py index 6e51138..e2a156e 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py @@ -1,12 +1,10 @@ import logging -from typing import Dict, Any, List -import pyodbc -from decimal import Decimal +from typing import Any logger = logging.getLogger(__name__) -def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts for SQL Server.""" cursor = conn.cursor() try: @@ -14,7 +12,7 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: if cursor.description: columns = [column[0] for column in cursor.description] rows = cursor.fetchall() - return [dict(zip(columns, row)) for row in rows] + return [dict(zip(columns, row, strict=False)) for row in rows] return [] finally: cursor.close() @@ -23,9 +21,9 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: def profile_mssql_data( conn: Any, schema_name: str, - schema_structure: Dict[str, Any], + schema_structure: dict[str, Any], sample_size: int = 10000, -) -> Dict[str, Any]: +) -> dict[str, Any]: profile_results = { "nullability": {}, "cardinality": {}, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py index 994e6aa..2eceb0e 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py @@ -1,10 +1,10 @@ import logging -from typing import Dict, Any, List +from typing import Any logger = logging.getLogger(__name__) -def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: cursor = conn.cursor(dictionary=True) try: cursor.execute(query) @@ -16,9 +16,9 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: def profile_mysql_data( conn: Any, schema_name: str, - schema_structure: Dict[str, Any], + schema_structure: dict[str, Any], sample_size: int = 10000, -) -> Dict[str, Any]: +) -> dict[str, Any]: try: conn.database = schema_name except Exception as e: diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py index 992b814..17d5696 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py +++ 
b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py @@ -1,12 +1,10 @@ import logging -from typing import Dict, Any, List -import psycopg2 -from decimal import Decimal +from typing import Any logger = logging.getLogger(__name__) -def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" cursor = conn.cursor() try: @@ -15,7 +13,7 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: if cursor.description: columns = [desc[0] for desc in cursor.description] rows = cursor.fetchall() - return [dict(zip(columns, row)) for row in rows] + return [dict(zip(columns, row, strict=False)) for row in rows] return [] finally: cursor.close() @@ -24,9 +22,9 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: def profile_postgres_data( conn: Any, schema_name: str, - schema_structure: Dict[str, Any], + schema_structure: dict[str, Any], sample_size: int = 10000, -) -> Dict[str, Any]: +) -> dict[str, Any]: profile_results = { "nullability": {}, "cardinality": {}, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py index e7dd8a8..17341b3 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/agent.py @@ -1,9 +1,11 @@ from google.adk.agents.llm_agent import LlmAgent -from .tools import validate_db_connection +from app.config import MODEL + +from .tools import validate_db_connection database_cred_agent = LlmAgent( - model="gemini-2.5-flash", + model=MODEL, name="database_cred_agent", description="A helpful assistant that collects and validates database connection details, and lists available schemas.", instruction=""" diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py index a117ab2..287912f 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py @@ -1,16 +1,16 @@ -from google.adk.tools import ToolContext import logging -from typing import Dict, Any, List +from typing import Any -import psycopg2 import mysql.connector +import psycopg2 import pyodbc +from google.adk.tools import ToolContext logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) -def _get_schemas(conn: Any, db_type: str) -> List[str]: +def _get_schemas(conn: Any, db_type: str) -> list[str]: """Fetches list of schemas/databases based on db type.""" schemas = [] cursor = conn.cursor() @@ -52,8 +52,8 @@ def _get_schemas(conn: Any, db_type: str) -> List[str]: async def validate_db_connection( - connection_details: Dict[str, Any], tool_context: ToolContext -) -> Dict[str, Any]: + connection_details: dict[str, Any], tool_context: ToolContext +) -> dict[str, Any]: """Validates a database connection for PostgreSQL, MySQL, or MSSQL, fetches available schemas, and saves metadata to session memory. 
diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py index 8a00f0d..7878ca6 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/qa_agent/agent.py @@ -1,8 +1,11 @@ import json from decimal import Decimal + from google.adk.agents.llm_agent import LlmAgent from google.adk.agents.readonly_context import ReadonlyContext +from app.config import MODEL + def json_encoder_default(obj): if isinstance(obj, Decimal): @@ -99,7 +102,7 @@ def qa_agent_instruction(ctx: ReadonlyContext) -> str: qa_agent = LlmAgent( - model="gemini-2.5-flash", + model=MODEL, name="qa_agent", description="Answers natural language questions about the discovered database schema structure and data profiling results.", instruction=qa_agent_instruction, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py index 79c0d1a..51d5cc7 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/agent.py @@ -1,8 +1,11 @@ from google.adk.agents.llm_agent import LlmAgent -from .tools import generate_summary_report, export_full_report, generate_erd_script + +from app.config import MODEL + +from .tools import export_full_report, generate_erd_script, generate_summary_report reporting_agent = LlmAgent( - model="gemini-2.5-flash", + model=MODEL, name="reporting_agent", description="Generates reports, exports data, and creates schema diagrams.", instruction=""" diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py index f809c94..7a3dc41 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py @@ -1,16 +1,16 @@ +import json import logging -from typing import Dict, Any, List +from typing import Any + from google.adk.tools import ToolContext -import json -import yaml logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) async def generate_summary_report( - tool_context: ToolContext, args: Dict[str, Any] -) -> Dict[str, Any]: + tool_context: ToolContext, args: dict[str, Any] +) -> dict[str, Any]: """ Generates a high-level summary report of the database analysis. @@ -89,12 +89,6 @@ async def generate_summary_report( return {"status": "success", "report_text": report} -import json -import logging - -logger = logging.getLogger(__name__) - - async def export_full_report(tool_context: ToolContext, args: dict) -> dict: """ Exports the full schema structure and data profile as a clean JSON report. 
@@ -166,12 +160,12 @@ def safe_encoder(obj): except Exception as e: logger.error(f"Error generating JSON report: {e}", exc_info=True) - return {"status": "error", "error": f"Failed to generate JSON report: {str(e)}"} + return {"status": "error", "error": f"Failed to generate JSON report: {e!s}"} async def generate_erd_script( - tool_context: ToolContext, args: Dict[str, Any] -) -> Dict[str, Any]: + tool_context: ToolContext, args: dict[str, Any] +) -> dict[str, Any]: """ Generates a complete, valid Mermaid ER Diagram script. @@ -236,8 +230,8 @@ def sanitize_datatype(dtype: str) -> str: def format_column( table_name: str, col_name: str, - col_info: Dict[str, Any], - constraints_info: List[Dict[str, Any]], + col_info: dict[str, Any], + constraints_info: list[dict[str, Any]], ) -> str: """Format a column entry with proper constraints for Mermaid.""" dtype = sanitize_datatype(col_info.get("type", "text")) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py index 7215214..6da155b 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/agent.py @@ -1,15 +1,17 @@ from google.adk.agents.llm_agent import LlmAgent + +from app.config import MODEL + from .tools import get_schema_details -import json schema_introspection_agent = LlmAgent( - model="gemini-2.5-flash", + model=MODEL, name="schema_introspection_agent", description="Introspects the selected database schema to discover tables, columns, constraints, relationships, indexes, and views.", instruction=""" ### Role You are a **Database Schema Introspection Agent**. Your sole task is to fetch and summarize the schema structure of a database. - + ### Scope - You can only report **schema-level information**: tables, columns, constraints, indexes, foreign keys, inferred relationships, and anomalies. - Do **not** answer questions about data content, queries, or performance. 
Forward all other questions to the QA agent using: diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py index 2935e8c..961658f 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py @@ -1,20 +1,21 @@ import logging -from typing import Dict, Any, List -from google.adk.tools import ToolContext +from typing import Any + +import mysql.connector # Import database connectors import psycopg2 -import mysql.connector import pyodbc +from google.adk.tools import ToolContext # Import utils -from .utils import postgresql_utils, mysql_utils, mssql_utils +from .utils import mssql_utils, mysql_utils, postgresql_utils logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) -def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: +def _get_db_connection(metadata: dict[str, Any], password: str) -> Any: db_type = metadata.get("db_type") host = metadata.get("host") port = metadata.get("port") @@ -44,7 +45,7 @@ def _get_db_connection(metadata: Dict[str, Any], password: str) -> Any: raise ValueError(f"Unsupported database type: {db_type}") -def _generate_summary(schema_details: Dict[str, Any]) -> Dict[str, int]: +def _generate_summary(schema_details: dict[str, Any]) -> dict[str, int]: """Generates a summary of the introspected schema structure.""" summary = { "tables": len(schema_details.get("tables", {})), @@ -64,8 +65,8 @@ def _generate_summary(schema_details: Dict[str, Any]) -> Dict[str, int]: async def get_schema_details( - tool_context: ToolContext, args: Dict[str, Any] -) -> Dict[str, Any]: + tool_context: ToolContext, args: dict[str, Any] +) -> dict[str, Any]: """ Retrieves detailed schema information and a summary for the given schema_name. Updates the session state with the selected_schema and schema_structure. 
@@ -123,7 +124,7 @@ async def get_schema_details( except Exception as e: logger.error(f"Error during schema introspection: {e}", exc_info=True) return { - "error": f"Failed to get schema details for {db_type} ({schema_name}): {str(e)}" + "error": f"Failed to get schema details for {db_type} ({schema_name}): {e!s}" } finally: if conn: diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py index e00f2bc..e49155c 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py @@ -1,13 +1,13 @@ -import logging -from typing import Dict, Any, List -import psycopg2 import json +import logging import os import re +from typing import Any + +import google.auth from google import genai from google.api_core import exceptions from google.genai import types -import google.auth logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") @@ -48,7 +48,7 @@ ) -def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" cursor = conn.cursor() try: @@ -56,14 +56,14 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: if cursor.description: columns = [desc[0] for desc in cursor.description] rows = cursor.fetchall() - return [dict(zip(columns, row)) for row in rows] + return [dict(zip(columns, row, strict=False)) for row in rows] return [] finally: cursor.close() def _construct_llm_prompt( - schema_name: str, db_type: str, schema_details: Dict[str, Any] + schema_name: str, db_type: str, schema_details: dict[str, Any] ) -> str: """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" tables_context = {} @@ -146,8 +146,8 @@ def _extract_json_content(text: str) -> str: def _analyze_with_llm( - schema_name: str, db_type: str, schema_details: Dict[str, Any] -) -> Dict[str, List[Dict[str, Any]]]: + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> dict[str, list[dict[str, Any]]]: """Calls an LLM to get inferred relationships and anomalies.""" if not client: logger.error("GenAI Client not initialized. 
Skipping LLM analysis.") @@ -202,7 +202,7 @@ def _analyze_with_llm( } -def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: +def get_postgres_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: details = { "tables": {}, "views": {}, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py index 0d7fdc7..0dc64ac 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py @@ -1,13 +1,14 @@ -import logging -from typing import Dict, Any, List -import mysql.connector import json +import logging import os import re +from typing import Any + +import google.auth +import mysql.connector from google import genai from google.api_core import exceptions from google.genai import types -import google.auth logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") @@ -46,7 +47,7 @@ logger.error("Cannot initialize GenAI Client: GOOGLE_CLOUD_PROJECT is not set.") -def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts.""" cursor = conn.cursor(dictionary=True) try: @@ -57,7 +58,7 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: def _construct_llm_prompt( - schema_name: str, db_type: str, schema_details: Dict[str, Any] + schema_name: str, db_type: str, schema_details: dict[str, Any] ) -> str: """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" @@ -157,8 +158,8 @@ def _extract_json_content(text: str) -> str: def _analyze_with_llm( - schema_name: str, db_type: str, schema_details: Dict[str, Any] -) -> Dict[str, List[Dict[str, Any]]]: + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> dict[str, list[dict[str, Any]]]: """Calls an LLM to get inferred relationships and anomalies.""" if not client: logger.error("GenAI Client not initialized. Skipping LLM analysis.") @@ -219,7 +220,7 @@ def _analyze_with_llm( } -def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: +def get_mysql_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: # For MySQL, schema_name is the database name. logger.info(f"Fetching MySQL schema details for: {schema_name}") try: @@ -239,7 +240,7 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: # 1. 
Fetch Basic Schema Info tables_query = "SHOW FULL TABLES WHERE Table_type = 'BASE TABLE';" tables = _execute_query(conn, tables_query) - table_names = [list(t.values())[0] for t in tables] + table_names = [next(iter(t.values())) for t in tables] for t_name in table_names: details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} @@ -290,7 +291,7 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: views_query = "SHOW FULL TABLES WHERE Table_type = 'VIEW';" views = _execute_query(conn, views_query) - for v_name in [list(v.values())[0] for v in views]: + for v_name in [next(iter(v.values())) for v in views]: try: definition_query = f"SHOW CREATE VIEW `{v_name}`;" definition = _execute_query(conn, definition_query) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py index 7371f55..d1d57e6 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py @@ -1,13 +1,13 @@ -import logging -from typing import Dict, Any, List -import psycopg2 import json +import logging import os import re +from typing import Any + +import google.auth from google import genai from google.api_core import exceptions from google.genai import types -import google.auth logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") @@ -48,7 +48,7 @@ ) -def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: +def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" cursor = conn.cursor() try: @@ -56,14 +56,14 @@ def _execute_query(conn: Any, query: str) -> List[Dict[str, Any]]: if cursor.description: columns = [desc[0] for desc in cursor.description] rows = cursor.fetchall() - return [dict(zip(columns, row)) for row in rows] + return [dict(zip(columns, row, strict=False)) for row in rows] return [] finally: cursor.close() def _construct_llm_prompt( - schema_name: str, db_type: str, schema_details: Dict[str, Any] + schema_name: str, db_type: str, schema_details: dict[str, Any] ) -> str: """Constructs a prompt for the LLM to analyze relationships and anomalies with formatted JSON.""" tables_context = {} @@ -150,8 +150,8 @@ def _extract_json_content(text: str) -> str: def _analyze_with_llm( - schema_name: str, db_type: str, schema_details: Dict[str, Any] -) -> Dict[str, List[Dict[str, Any]]]: + schema_name: str, db_type: str, schema_details: dict[str, Any] +) -> dict[str, list[dict[str, Any]]]: """Calls an LLM to get inferred relationships and anomalies.""" if not client: logger.error("GenAI Client not initialized. 
Skipping LLM analysis.") @@ -206,7 +206,7 @@ def _analyze_with_llm( } -def get_postgres_schema_details(conn: Any, schema_name: str) -> Dict[str, Any]: +def get_postgres_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: details = { "tables": {}, "views": {}, From dd05e5a3fb2c957f45af54797ebb3c214092dfeb Mon Sep 17 00:00:00 2001 From: SanuGhosh Date: Thu, 20 Nov 2025 23:06:00 +0530 Subject: [PATCH 8/8] feat(data-model-discovery-agent): fix ruff issues --- .../sub_agents/data_profiling_agent/tools.py | 5 +- .../utils/mssql_profiling_utils.py | 2 +- .../utils/mysql_profiling_utils.py | 2 +- .../utils/postgres_profiling_utils.py | 2 +- .../sub_agents/database_cred_agent/tools.py | 10 +- .../sub_agents/reporting_agent/tools.py | 13 -- .../schema_introspection_agent/tools.py | 4 +- .../utils/mssql_utils.py | 127 +++++++++--------- .../utils/mysql_utils.py | 8 +- .../utils/postgresql_utils.py | 8 +- agent-app/uv.lock | 71 +++++++++- 11 files changed, 152 insertions(+), 100 deletions(-) diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py index 9493e35..d632bb4 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/tools.py @@ -19,7 +19,8 @@ def _get_db_connection(metadata: dict[str, Any], password: str) -> Any: db_type = metadata.get("db_type") host = metadata.get("host") - port = int(metadata.get("port")) + port_value = metadata.get("port") + port = int(port_value) if port_value is not None else None dbname = metadata.get("dbname") user = metadata.get("user") logger.info( @@ -48,8 +49,6 @@ async def profile_schema_data( Calculates nullability, cardinality, orphan records, and type anomalies. Sets a flag on successful completion. """ - if not isinstance(args, dict): - return {"error": "Invalid arguments. 
Expected a dictionary for args."} db_conn_state = tool_context.state.get("db_connection") db_creds = tool_context.state.get("db_creds_temp") diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py index e2a156e..8f74741 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mssql_profiling_utils.py @@ -24,7 +24,7 @@ def profile_mssql_data( schema_structure: dict[str, Any], sample_size: int = 10000, ) -> dict[str, Any]: - profile_results = { + profile_results: dict[str, Any] = { "nullability": {}, "cardinality": {}, "orphan_records": {}, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py index 2eceb0e..8b73fa1 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/mysql_profiling_utils.py @@ -25,7 +25,7 @@ def profile_mysql_data( logger.error(f"Failed to set database {schema_name}: {e}") raise - profile_results = { + profile_results: dict[str, Any] = { "nullability": {}, "cardinality": {}, "orphan_records": {}, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py index 17d5696..9217f46 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/data_profiling_agent/utils/postgres_profiling_utils.py @@ -25,7 +25,7 @@ def profile_postgres_data( schema_structure: dict[str, Any], sample_size: int = 10000, ) -> dict[str, Any]: - profile_results = { + profile_results: dict[str, dict] = { "nullability": {}, "cardinality": {}, "orphan_records": {}, diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py index 287912f..810c5e7 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/database_cred_agent/tools.py @@ -121,11 +121,11 @@ async def validate_db_connection( # Clear any previous connection state if "db_connection" in tool_context.state: - del tool_context.state["db_connection"] + tool_context.state["db_connection"] = None if "db_creds_temp" in tool_context.state: - del tool_context.state["db_creds_temp"] + tool_context.state["db_creds_temp"] = None if "selected_schema" in tool_context.state: - del tool_context.state["selected_schema"] + tool_context.state["selected_schema"] = None tool_context.state["db_connection"] = { "metadata": { @@ -151,9 +151,9 @@ async def validate_db_connection( except Exception as e: logger.error(f"Database connection or schema fetch failed for {db_type}: {e}") if 
"db_connection" in tool_context.state: - del tool_context.state["db_connection"] + tool_context.state["db_connection"] = None if "db_creds_temp" in tool_context.state: - del tool_context.state["db_creds_temp"] + tool_context.state["db_creds_temp"] = None return { "status": "error", "message": f"Connection/Schema fetch failed for {db_type}: {e}", diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py index 7a3dc41..9d8a4e7 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/reporting_agent/tools.py @@ -28,8 +28,6 @@ async def generate_summary_report( - report_text: The markdown formatted summary report (on success). - error: An error message (on failure). """ - if not isinstance(args, dict): - return {"error": "Invalid arguments. Expected a dictionary for args."} schema_structure = tool_context.state.get("schema_structure") data_profile = tool_context.state.get("data_profile") @@ -108,11 +106,6 @@ async def export_full_report(tool_context: ToolContext, args: dict) -> dict: "error": Optional error message } """ - if not isinstance(args, dict): - return { - "status": "error", - "error": "Invalid arguments. Expected a dictionary for args.", - } schema_structure = tool_context.state.get("schema_structure") data_profile = tool_context.state.get("data_profile") @@ -193,12 +186,6 @@ async def generate_erd_script( } """ - if not isinstance(args, dict): - return { - "status": "error", - "error": "Invalid arguments. Expected a dictionary for args.", - } - schema_structure = tool_context.state.get("schema_structure") if not schema_structure: return { diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py index 961658f..69a51e4 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/tools.py @@ -26,7 +26,7 @@ def _get_db_connection(metadata: dict[str, Any], password: str) -> Any: raise ValueError( "Missing one or more required connection parameters in metadata or password." 
) - port = int(port) + port = int(port) # type: ignore[arg-type] logger.info( f"Attempting to connect to {db_type} at {host}:{port} as {user} to database {dbname}" ) @@ -86,7 +86,7 @@ async def get_schema_details( tool_context.state["selected_schema"] = schema_name if "available_schemas" in tool_context.state: - del tool_context.state["available_schemas"] + tool_context.state["available_schemas"] = None metadata = db_conn_state["metadata"] password = db_creds["password"] diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py index e49155c..df061f8 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mssql_utils.py @@ -5,6 +5,7 @@ from typing import Any import google.auth +import pyodbc from google import genai from google.api_core import exceptions from google.genai import types @@ -12,11 +13,12 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + try: _, project_id = google.auth.default() GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) except google.auth.exceptions.DefaultCredentialsError: - GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") # type: ignore[assignment] if not GOOGLE_CLOUD_PROJECT: logger.warning( @@ -27,7 +29,7 @@ GOOGLE_GENAI_USE_VERTEXAI = os.environ.get( "GOOGLE_GENAI_USE_VERTEXAI", "True" ).lower() in ("true", "1") -MODEL = os.environ.get("MODEL", "gemini-2.5-pro") +MODEL = os.environ.get("MODEL", "gemini-1.5-pro") client = None if GOOGLE_CLOUD_PROJECT: @@ -38,26 +40,30 @@ location=GOOGLE_CLOUD_LOCATION, ) logger.info( - f"GenAI Client initialized in postgres_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}" + f"GenAI Client initialized in mssql_utils. VertexAI: {GOOGLE_GENAI_USE_VERTEXAI}, Project: {GOOGLE_CLOUD_PROJECT}, Location: {GOOGLE_CLOUD_LOCATION}, Model: {MODEL}" ) except Exception as e: - logger.error(f"Failed to initialize GenAI Client in postgres_utils: {e}") + logger.error(f"Failed to initialize GenAI Client in mssql_utils: {e}") else: logger.error( - "Cannot initialize GenAI Client in postgres_utils: GOOGLE_CLOUD_PROJECT is not set." + "Cannot initialize GenAI Client in mssql_utils: GOOGLE_CLOUD_PROJECT is not set." 
) def _execute_query(conn: Any, query: str) -> list[dict[str, Any]]: - """Executes a SQL query and returns results as a list of dicts for PostgreSQL.""" + """Executes a SQL query and returns results as a list of dicts for SQL Server.""" cursor = conn.cursor() try: cursor.execute(query) if cursor.description: - columns = [desc[0] for desc in cursor.description] + columns = [column[0] for column in cursor.description] rows = cursor.fetchall() return [dict(zip(columns, row, strict=False)) for row in rows] return [] + except pyodbc.Error as ex: + sqlstate = ex.args[0] + logger.error(f"SQL Error ({sqlstate}): {ex} for query: {query}") + raise finally: cursor.close() @@ -163,10 +169,10 @@ def _analyze_with_llm( logger.debug(f"****** Custom_LLM_Request: {prompt}") response = client.models.generate_content( model=MODEL, - contents=[types.Part.from_text(text=prompt)], + contents=[types.Part.from_text(text=prompt)], # type: ignore[arg-type] config=types.GenerateContentConfig(response_mime_type="application/json"), ) - generated_text = response.candidates[0].content.parts[0].text + generated_text = response.candidates[0].content.parts[0].text # type: ignore[index, union-attr, assignment] logger.debug(f"****** Raw LLM Response: {generated_text}") cleaned_json = _extract_json_content(generated_text) logger.debug( @@ -202,99 +208,90 @@ def _analyze_with_llm( } -def get_postgres_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: - details = { +def get_mssql_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: + logger.info(f"Fetching MSSQL schema details for: {schema_name}") + details: dict[str, Any] = { "tables": {}, "views": {}, "foreign_keys": [], "inferred_relationships": [], "anomalies": [], } - logger.info(f"Fetching PostgreSQL schema details for: {schema_name}") - tables_query = f""" - SELECT table_name - FROM information_schema.tables - WHERE table_schema = '{schema_name}' AND table_type = 'BASE TABLE'; - """ + tables_query = f"SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_TYPE = 'BASE TABLE';" tables = _execute_query(conn, tables_query) for table in tables: - t_name = table["table_name"] + t_name = table["TABLE_NAME"] details["tables"][t_name] = {"columns": {}, "constraints": [], "indexes": []} - cols_query = f""" - SELECT column_name, data_type, character_maximum_length, numeric_precision, numeric_scale, is_nullable, column_default - FROM information_schema.columns WHERE table_schema = '{schema_name}' AND table_name = '{t_name}'; - """ + cols_query = f"SELECT COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, NUMERIC_SCALE, IS_NULLABLE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{schema_name}' AND TABLE_NAME = '{t_name}';" for col in _execute_query(conn, cols_query): - details["tables"][t_name]["columns"][col["column_name"]] = { - "type": col["data_type"], - "length": col["character_maximum_length"], - "precision": col["numeric_precision"], - "scale": col["numeric_scale"], - "nullable": col["is_nullable"] == "YES", - "default": col["column_default"], + details["tables"][t_name]["columns"][col["COLUMN_NAME"]] = { + "type": col["DATA_TYPE"], + "length": col["CHARACTER_MAXIMUM_LENGTH"], + "precision": col["NUMERIC_PRECISION"], + "scale": col["NUMERIC_SCALE"], + "nullable": col["IS_NULLABLE"] == "YES", + "default": col["COLUMN_DEFAULT"], } + constraints_query = f""" - SELECT tc.table_name, tc.constraint_name, tc.constraint_type, kcu.column_name, cc.check_clause - FROM 
information_schema.table_constraints tc - LEFT JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema AND tc.table_name = kcu.table_name - LEFT JOIN information_schema.check_constraints cc ON tc.constraint_name = cc.constraint_name AND tc.table_schema = cc.constraint_schema - WHERE tc.table_schema = '{schema_name}' AND tc.table_name = '{t_name}'; + SELECT KCU.TABLE_NAME, TC.CONSTRAINT_NAME, TC.CONSTRAINT_TYPE, KCU.COLUMN_NAME, CC.CHECK_CLAUSE + FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC + LEFT JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KCU ON TC.CONSTRAINT_NAME = KCU.CONSTRAINT_NAME AND TC.TABLE_SCHEMA = KCU.TABLE_SCHEMA AND TC.TABLE_NAME = KCU.TABLE_NAME + LEFT JOIN INFORMATION_SCHEMA.CHECK_CONSTRAINTS AS CC ON TC.CONSTRAINT_NAME = CC.CONSTRAINT_NAME AND TC.CONSTRAINT_SCHEMA = CC.CONSTRAINT_SCHEMA + WHERE TC.TABLE_SCHEMA = '{schema_name}' AND KCU.TABLE_NAME = '{t_name}'; """ details["tables"][t_name]["constraints"] = _execute_query( conn, constraints_query ) + indexes_query = f""" - SELECT - t.relname AS table_name, i.relname AS index_name, a.attname AS column_name, ix.indisunique AS is_unique - FROM pg_class t JOIN pg_index ix ON t.oid = ix.indrelid JOIN pg_class i ON i.oid = ix.indexrelid - LEFT JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = ANY(ix.indkey) - JOIN pg_namespace n ON t.relnamespace = n.oid WHERE n.nspname = '{schema_name}' AND t.relname = '{t_name}' AND t.relkind = 'r'; + SELECT t.name AS table_name, ind.name AS index_name, COL_NAME(ic.object_id, ic.column_id) AS column_name, ind.is_unique + FROM sys.indexes ind INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id AND ind.index_id = ic.index_id + INNER JOIN sys.tables t ON ind.object_id = t.object_id INNER JOIN sys.schemas s ON t.schema_id = s.schema_id + WHERE s.name = '{schema_name}' AND t.name = '{t_name}' AND ind.is_hypothetical = 0 AND ind.type > 0; """ try: indexes = _execute_query(conn, indexes_query) grouped_indexes = {} for index in indexes: - if index["column_name"]: - idx_name = index["index_name"] - if idx_name not in grouped_indexes: - grouped_indexes[idx_name] = { - "name": idx_name, - "columns": [], - "unique": index["is_unique"], - } - if index["column_name"] not in grouped_indexes[idx_name]["columns"]: - grouped_indexes[idx_name]["columns"].append( - index["column_name"] - ) + idx_name = index["index_name"] + if not idx_name: + continue + if idx_name not in grouped_indexes: + grouped_indexes[idx_name] = { + "name": idx_name, + "columns": [], + "unique": index["is_unique"], + } + if index["column_name"] not in grouped_indexes[idx_name]["columns"]: + grouped_indexes[idx_name]["columns"].append(index["column_name"]) details["tables"][t_name]["indexes"] = list(grouped_indexes.values()) except Exception as e: - logger.error(f"Error fetching PostgreSQL indexes for {t_name}: {e}") + logger.error(f"Error fetching MSSQL indexes for {t_name}: {e}") fks_query = f""" - SELECT - tc.constraint_name, tc.table_name AS from_table, kcu.column_name AS from_column, - ccu.table_schema AS to_schema, ccu.table_name AS to_table, ccu.column_name AS to_column - FROM information_schema.table_constraints AS tc JOIN information_schema.key_column_usage AS kcu - ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema - JOIN information_schema.constraint_column_usage AS ccu - ON ccu.constraint_name = tc.constraint_name AND ccu.table_schema = tc.table_schema - WHERE tc.constraint_type = 'FOREIGN KEY' AND 
tc.table_schema = '{schema_name}'; + SELECT KCU1.CONSTRAINT_NAME AS constraint_name, KCU1.TABLE_NAME AS from_table, KCU1.COLUMN_NAME AS from_column, + KCU2.TABLE_SCHEMA AS to_schema, KCU2.TABLE_NAME AS to_table, KCU2.COLUMN_NAME AS to_column + FROM INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS RC + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU1 ON KCU1.CONSTRAINT_SCHEMA = RC.CONSTRAINT_SCHEMA AND KCU1.CONSTRAINT_NAME = RC.CONSTRAINT_NAME + JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE KCU2 ON KCU2.CONSTRAINT_SCHEMA = RC.UNIQUE_CONSTRAINT_SCHEMA AND KCU2.CONSTRAINT_NAME = RC.UNIQUE_CONSTRAINT_NAME AND KCU2.ORDINAL_POSITION = KCU1.ORDINAL_POSITION + WHERE KCU1.TABLE_SCHEMA = '{schema_name}'; """ details["foreign_keys"] = _execute_query(conn, fks_query) - views_query = f"SELECT table_name AS view_name, view_definition FROM information_schema.views WHERE table_schema = '{schema_name}';" + views_query = f"SELECT TABLE_NAME AS view_name, VIEW_DEFINITION FROM INFORMATION_SCHEMA.VIEWS WHERE TABLE_SCHEMA = '{schema_name}';" details["views"] = { - view["view_name"]: {"definition": view["view_definition"]} + view["view_name"]: {"definition": view["VIEW_DEFINITION"]} for view in _execute_query(conn, views_query) } - llm_analysis = _analyze_with_llm(schema_name, "PostgreSQL", details) + llm_analysis = _analyze_with_llm(schema_name, "Microsoft SQL Server", details) details["inferred_relationships"] = llm_analysis.get("inferred_relationships", []) details["anomalies"] = llm_analysis.get("anomalies", []) logger.info( - f"Found {len(details['inferred_relationships'])} potential inferred relationships for PostgreSQL." + f"Found {len(details['inferred_relationships'])} potential inferred relationships for MSSQL." ) logger.info( - f"Found {len(details['anomalies'])} potential relationship anomalies for PostgreSQL." + f"Found {len(details['anomalies'])} potential relationship anomalies for MSSQL." 
) return details diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py index 0dc64ac..c244326 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/mysql_utils.py @@ -17,7 +17,7 @@ _, project_id = google.auth.default() GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) except google.auth.exceptions.DefaultCredentialsError: - GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") # type: ignore[assignment] if not GOOGLE_CLOUD_PROJECT: logger.warning( @@ -175,9 +175,9 @@ def _analyze_with_llm( logger.debug(f"****** Custom_LLM_Request: {prompt}") response = client.models.generate_content( model=MODEL, - contents=[types.Part.from_text(text=prompt)], + contents=[types.Part.from_text(text=prompt)], # type: ignore[arg-type] ) - generated_text = response.candidates[0].content.parts[0].text + generated_text = response.candidates[0].content.parts[0].text # type: ignore[index, union-attr, assignment] logger.debug(f"****** Raw LLM Response: {generated_text}") # handles ```json blocks @@ -229,7 +229,7 @@ def get_mysql_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: logger.error(f"MySQL change database failed: {err}") raise - details = { + details: dict[str, Any] = { "tables": {}, "views": {}, "foreign_keys": [], diff --git a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py index d1d57e6..2b76be9 100644 --- a/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py +++ b/agent-app/app/sub_agents/data_model_discovery_agent/sub_agents/schema_introspection_agent/utils/postgresql_utils.py @@ -16,7 +16,7 @@ _, project_id = google.auth.default() GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", project_id) except google.auth.exceptions.DefaultCredentialsError: - GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") + GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") # type: ignore[assignment] if not GOOGLE_CLOUD_PROJECT: logger.warning( @@ -167,10 +167,10 @@ def _analyze_with_llm( logger.debug(f"****** Custom_LLM_Request: {prompt}") response = client.models.generate_content( model=MODEL, - contents=[types.Part.from_text(text=prompt)], + contents=[types.Part.from_text(text=prompt)], # type: ignore[arg-type] config=types.GenerateContentConfig(response_mime_type="application/json"), ) - generated_text = response.candidates[0].content.parts[0].text + generated_text = response.candidates[0].content.parts[0].text # type: ignore[index, union-attr, assignment] logger.debug(f"****** Raw LLM Response: {generated_text}") cleaned_json = _extract_json_content(generated_text) logger.debug( @@ -207,7 +207,7 @@ def _analyze_with_llm( def get_postgres_schema_details(conn: Any, schema_name: str) -> dict[str, Any]: - details = { + details: dict[str, Any] = { "tables": {}, "views": {}, "foreign_keys": [], diff --git a/agent-app/uv.lock b/agent-app/uv.lock index 05d78ce..e2d5e54 100644 --- 
a/agent-app/uv.lock +++ b/agent-app/uv.lock @@ -39,8 +39,8 @@ dependencies = [ { name = "pdfplumber" }, { name = "plantuml" }, { name = "psycopg2-binary" }, - { name = "pyodbc" }, { name = "pygithub" }, + { name = "pyodbc" }, { name = "reportlab" }, { name = "scipy" }, { name = "tabulate" }, @@ -3167,6 +3167,75 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/c4/87d27b108c2f6d773aa5183c5ae367b2a99296ea4bc16eb79f453c679e30/pymupdf-1.26.4-cp39-abi3-win_amd64.whl", hash = "sha256:0b6345a93a9afd28de2567e433055e873205c52e6b920b129ca50e836a3aeec6", size = 18743491 }, ] +[[package]] +name = "pynacl" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591 }, + { url = "https://files.pythonhosted.org/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866 }, + { url = "https://files.pythonhosted.org/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001 }, + { url = "https://files.pythonhosted.org/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024 }, + { url = "https://files.pythonhosted.org/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766 }, + { url = "https://files.pythonhosted.org/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275 }, + { url = "https://files.pythonhosted.org/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891 }, + { url = "https://files.pythonhosted.org/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291 }, + { url = "https://files.pythonhosted.org/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839 }, + { 
url = "https://files.pythonhosted.org/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371 }, + { url = "https://files.pythonhosted.org/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031 }, + { url = "https://files.pythonhosted.org/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585 }, + { url = "https://files.pythonhosted.org/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923 }, + { url = "https://files.pythonhosted.org/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970 }, +] + +[[package]] +name = "pyodbc" +version = "5.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8f/85/44b10070a769a56bd910009bb185c0c0a82daff8d567cd1a116d7d730c7d/pyodbc-5.3.0.tar.gz", hash = "sha256:2fe0e063d8fb66efd0ac6dc39236c4de1a45f17c33eaded0d553d21c199f4d05", size = 121770 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/cd/d0ac9e8963cf43f3c0e8ebd284cd9c5d0e17457be76c35abe4998b7b6df2/pyodbc-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6682cdec78f1302d0c559422c8e00991668e039ed63dece8bf99ef62173376a5", size = 71888 }, + { url = "https://files.pythonhosted.org/packages/cb/7b/95ea2795ea8a0db60414e14f117869a5ba44bd52387886c1a210da637315/pyodbc-5.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9cd3f0a9796b3e1170a9fa168c7e7ca81879142f30e20f46663b882db139b7d2", size = 71813 }, + { url = "https://files.pythonhosted.org/packages/95/c9/6f4644b60af513ea1c9cab1ff4af633e8f300e8468f4ae3507f04524e641/pyodbc-5.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:46185a1a7f409761716c71de7b95e7bbb004390c650d00b0b170193e3d6224bb", size = 318556 }, + { url = "https://files.pythonhosted.org/packages/19/3f/24876d9cb9c6ce1bd2b6f43f69ebc00b8eb47bf1ed99ee95e340bf90ed79/pyodbc-5.3.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:349a9abae62a968b98f6bbd23d2825151f8d9de50b3a8f5f3271b48958fdb672", size = 322048 }, + { url = "https://files.pythonhosted.org/packages/1f/27/faf17353605ac60f80136bc3172ed2d69d7defcb9733166293fc14ac2c52/pyodbc-5.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ac23feb7ddaa729f6b840639e92f83ff0ccaa7072801d944f1332cd5f5b05f47", size = 1286123 }, + { url = "https://files.pythonhosted.org/packages/d4/61/c9d407d2aa3e89f9bb68acf6917b0045a788ae8c3f4045c34759cb77af63/pyodbc-5.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8aa396c6d6af52ccd51b8c8a5bffbb46fd44e52ce07ea4272c1d28e5e5b12722", size = 1343502 }, + { url = "https://files.pythonhosted.org/packages/d9/9f/f1b0f3238d873d4930aa2a2b8d5ba97132f6416764bf0c87368f8d6f2139/pyodbc-5.3.0-cp310-cp310-win32.whl", hash = 
"sha256:46869b9a6555ff003ed1d8ebad6708423adf2a5c88e1a578b9f029fb1435186e", size = 62968 }, + { url = "https://files.pythonhosted.org/packages/d8/26/5f8ebdca4735aad0119aaaa6d5d73b379901b7a1dbb643aaa636040b27cf/pyodbc-5.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:705903acf6f43c44fc64e764578d9a88649eb21bf7418d78677a9d2e337f56f2", size = 69397 }, + { url = "https://files.pythonhosted.org/packages/d1/c8/480a942fd2e87dd7df6d3c1f429df075695ed8ae34d187fe95c64219fd49/pyodbc-5.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:c68d9c225a97aedafb7fff1c0e1bfe293093f77da19eaf200d0e988fa2718d16", size = 64446 }, + { url = "https://files.pythonhosted.org/packages/e0/c7/534986d97a26cb8f40ef456dfcf00d8483161eade6d53fa45fcf2d5c2b87/pyodbc-5.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ebc3be93f61ea0553db88589e683ace12bf975baa954af4834ab89f5ee7bf8ae", size = 71958 }, + { url = "https://files.pythonhosted.org/packages/69/3c/6fe3e9eae6db1c34d6616a452f9b954b0d5516c430f3dd959c9d8d725f2a/pyodbc-5.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9b987a25a384f31e373903005554230f5a6d59af78bce62954386736a902a4b3", size = 71843 }, + { url = "https://files.pythonhosted.org/packages/44/0e/81a0315d0bf7e57be24338dbed616f806131ab706d87c70f363506dc13d5/pyodbc-5.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:676031723aac7dcbbd2813bddda0e8abf171b20ec218ab8dfb21d64a193430ea", size = 327191 }, + { url = "https://files.pythonhosted.org/packages/43/ae/b95bb2068f911950322a97172c68675c85a3e87dc04a98448c339fcbef21/pyodbc-5.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5c30c5cd40b751f77bbc73edd32c4498630939bcd4e72ee7e6c9a4b982cc5ca", size = 332228 }, + { url = "https://files.pythonhosted.org/packages/dc/21/2433625f7d5922ee9a34e3805805fa0f1355d01d55206c337bb23ec869bf/pyodbc-5.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2035c7dfb71677cd5be64d3a3eb0779560279f0a8dc6e33673499498caa88937", size = 1296469 }, + { url = "https://files.pythonhosted.org/packages/3a/f4/c760caf7bb9b3ab988975d84bd3e7ebda739fe0075c82f476d04ee97324c/pyodbc-5.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5cbe4d753723c8a8f65020b7a259183ef5f14307587165ce37e8c7e251951852", size = 1353163 }, + { url = "https://files.pythonhosted.org/packages/14/ad/f9ca1e9e44fd91058f6e35b233b1bb6213d590185bfcc2a2c4f1033266e7/pyodbc-5.3.0-cp311-cp311-win32.whl", hash = "sha256:d255f6b117d05cfc046a5201fdf39535264045352ea536c35777cf66d321fbb8", size = 62925 }, + { url = "https://files.pythonhosted.org/packages/e6/cf/52b9b94efd8cfd11890ae04f31f50561710128d735e4e38a8fbb964cd2c2/pyodbc-5.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:f1ad0e93612a6201621853fc661209d82ff2a35892b7d590106fe8f97d9f1f2a", size = 69329 }, + { url = "https://files.pythonhosted.org/packages/8b/6f/bf5433bb345007f93003fa062e045890afb42e4e9fc6bd66acc2c3bd12ca/pyodbc-5.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:0df7ff47fab91ea05548095b00e5eb87ed88ddf4648c58c67b4db95ea4913e23", size = 64447 }, + { url = "https://files.pythonhosted.org/packages/f5/0c/7ecf8077f4b932a5d25896699ff5c394ffc2a880a9c2c284d6a3e6ea5949/pyodbc-5.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5ebf6b5d989395efe722b02b010cb9815698a4d681921bf5db1c0e1195ac1bde", size = 72994 }, + { url = "https://files.pythonhosted.org/packages/03/78/9fbde156055d88c1ef3487534281a5b1479ee7a2f958a7e90714968749ac/pyodbc-5.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:197bb6ddafe356a916b8ee1b8752009057fce58e216e887e2174b24c7ab99269", size = 72535 }, + { url = "https://files.pythonhosted.org/packages/9f/f9/8c106dcd6946e95fee0da0f1ba58cd90eb872eebe8968996a2ea1f7ac3c1/pyodbc-5.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c6ccb5315ec9e081f5cbd66f36acbc820ad172b8fa3736cf7f993cdf69bd8a96", size = 333565 }, + { url = "https://files.pythonhosted.org/packages/4b/30/2c70f47a76a4fafa308d148f786aeb35a4d67a01d41002f1065b465d9994/pyodbc-5.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dd3d5e469f89a3112cf8b0658c43108a4712fad65e576071e4dd44d2bd763c7", size = 340283 }, + { url = "https://files.pythonhosted.org/packages/7d/b2/0631d84731606bfe40d3b03a436b80cbd16b63b022c7b13444fb30761ca8/pyodbc-5.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b180bc5e49b74fd40a24ef5b0fe143d0c234ac1506febe810d7434bf47cb925b", size = 1302767 }, + { url = "https://files.pythonhosted.org/packages/74/b9/707c5314cca9401081b3757301241c167a94ba91b4bd55c8fa591bf35a4a/pyodbc-5.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e3c39de3005fff3ae79246f952720d44affc6756b4b85398da4c5ea76bf8f506", size = 1361251 }, + { url = "https://files.pythonhosted.org/packages/97/7c/893036c8b0c8d359082a56efdaa64358a38dda993124162c3faa35d1924d/pyodbc-5.3.0-cp312-cp312-win32.whl", hash = "sha256:d32c3259762bef440707098010035bbc83d1c73d81a434018ab8c688158bd3bb", size = 63413 }, + { url = "https://files.pythonhosted.org/packages/c0/70/5e61b216cc13c7f833ef87f4cdeab253a7873f8709253f5076e9bb16c1b3/pyodbc-5.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:fe77eb9dcca5fc1300c9121f81040cc9011d28cff383e2c35416e9ec06d4bc95", size = 70133 }, + { url = "https://files.pythonhosted.org/packages/aa/85/e7d0629c9714a85eb4f85d21602ce6d8a1ec0f313fde8017990cf913e3b4/pyodbc-5.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:afe7c4ac555a8d10a36234788fc6cfc22a86ce37fc5ba88a1f75b3e6696665dc", size = 64700 }, + { url = "https://files.pythonhosted.org/packages/0c/1d/9e74cbcc1d4878553eadfd59138364b38656369eb58f7e5b42fb344c0ce7/pyodbc-5.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7e9ab0b91de28a5ab838ac4db0253d7cc8ce2452efe4ad92ee6a57b922bf0c24", size = 72975 }, + { url = "https://files.pythonhosted.org/packages/37/c7/27d83f91b3144d3e275b5b387f0564b161ddbc4ce1b72bb3b3653e7f4f7a/pyodbc-5.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6132554ffbd7910524d643f13ce17f4a72f3a6824b0adef4e9a7f66efac96350", size = 72541 }, + { url = "https://files.pythonhosted.org/packages/1b/33/2bb24e7fc95e98a7b11ea5ad1f256412de35d2e9cc339be198258c1d9a76/pyodbc-5.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1629af4706e9228d79dabb4863c11cceb22a6dab90700db0ef449074f0150c0d", size = 343287 }, + { url = "https://files.pythonhosted.org/packages/fa/24/88cde8b6dc07a93a92b6c15520a947db24f55db7bd8b09e85956642b7cf3/pyodbc-5.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ceaed87ba2ea848c11223f66f629ef121f6ebe621f605cde9cfdee4fd9f4b68", size = 350094 }, + { url = "https://files.pythonhosted.org/packages/c2/99/53c08562bc171a618fa1699297164f8885e66cde38c3b30f454730d0c488/pyodbc-5.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3cc472c8ae2feea5b4512e23b56e2b093d64f7cbc4b970af51da488429ff7818", size = 1301029 }, + { url = 
"https://files.pythonhosted.org/packages/d8/10/68a0b5549876d4b53ba4c46eed2a7aca32d589624ed60beef5bd7382619e/pyodbc-5.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c79df54bbc25bce9f2d87094e7b39089c28428df5443d1902b0cc5f43fd2da6f", size = 1361420 }, + { url = "https://files.pythonhosted.org/packages/41/0f/9dfe4987283ffcb981c49a002f0339d669215eb4a3fe4ee4e14537c52852/pyodbc-5.3.0-cp313-cp313-win32.whl", hash = "sha256:c2eb0b08e24fe5c40c7ebe9240c5d3bd2f18cd5617229acee4b0a0484dc226f2", size = 63399 }, + { url = "https://files.pythonhosted.org/packages/56/03/15dcefe549d3888b649652af7cca36eda97c12b6196d92937ca6d11306e9/pyodbc-5.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:01166162149adf2b8a6dc21a212718f205cabbbdff4047dc0c415af3fd85867e", size = 70133 }, + { url = "https://files.pythonhosted.org/packages/c4/c1/c8b128ae59a14ecc8510e9b499208e342795aecc3af4c3874805c720b8db/pyodbc-5.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:363311bd40320b4a61454bebf7c38b243cd67c762ed0f8a5219de3ec90c96353", size = 64683 }, +] + [[package]] name = "pyparsing" version = "3.2.3"