diff --git a/uc-quickstart/utils/abac/finance/0.1finance_abac_functions.sql b/uc-quickstart/utils/abac/finance/0.1finance_abac_functions.sql new file mode 100644 index 00000000..4af9c48c --- /dev/null +++ b/uc-quickstart/utils/abac/finance/0.1finance_abac_functions.sql @@ -0,0 +1,260 @@ +-- ============================================= +-- DATABRICKS UNITY CATALOG ABAC MASKING FUNCTIONS - FINANCE DOMAIN +-- Purpose: Attribute-Based Access Control (ABAC) utility functions for financial services data masking +-- Compliance: PCI-DSS, AML/KYC, GDPR, SOX, GLBA +-- Reference: https://docs.databricks.com/aws/en/data-governance/unity-catalog/abac/ +-- ============================================= + +-- Set catalog and schema context +USE CATALOG fincat; +USE SCHEMA finance; + +-- ============================================= +-- MASKING FUNCTIONS (11 total) +-- These transform/hide data values while preserving table structure +-- ============================================= + +-- ============================================= +-- 1. CREDIT CARD FULL MASKING FUNCTION +-- Purpose: Complete masking of credit card numbers for PCI-DSS compliance +-- Usage: Customer service representatives with basic clearance +-- Input: Credit card number (e.g., 4532-1234-5678-9010) +-- Output: Fully masked (XXXX-XXXX-XXXX-XXXX) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_credit_card_full(card_number STRING) +RETURNS STRING +COMMENT 'ABAC utility: Full credit card masking for PCI-DSS compliance' +RETURN CASE + WHEN card_number IS NULL OR card_number = '' THEN card_number + ELSE 'XXXX-XXXX-XXXX-XXXX' +END; + +-- ============================================= +-- 2. 
CREDIT CARD LAST 4 DIGITS FUNCTION +-- Purpose: Show only last 4 digits for customer service verification +-- Usage: Customer service and fraud detection teams +-- Input: Credit card number (e.g., 4532-1234-5678-9010) +-- Output: Masked with last 4 visible (XXXX-XXXX-XXXX-9010) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_credit_card_last4(card_number STRING) +RETURNS STRING +COMMENT 'ABAC utility: Show last 4 digits of credit card for verification' +RETURN CASE + WHEN card_number IS NULL OR card_number = '' THEN card_number + WHEN LENGTH(REGEXP_REPLACE(card_number, '[^0-9]', '')) >= 4 THEN + CONCAT('XXXX-XXXX-XXXX-', RIGHT(REGEXP_REPLACE(card_number, '[^0-9]', ''), 4)) + ELSE 'XXXX-XXXX-XXXX-XXXX' +END; + +-- ============================================= +-- 3. SSN MASKING FUNCTION +-- Purpose: Mask Social Security Numbers while showing last 4 for verification +-- Usage: Customer service and compliance teams +-- Input: SSN (e.g., 123-45-6789) +-- Output: Masked SSN (XXX-XX-6789) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_ssn(ssn STRING) +RETURNS STRING +COMMENT 'ABAC utility: Mask SSN showing only last 4 digits for GLBA compliance' +RETURN CASE + WHEN ssn IS NULL OR ssn = '' THEN ssn + WHEN LENGTH(REGEXP_REPLACE(ssn, '[^0-9]', '')) = 9 THEN + CONCAT('XXX-XX-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) + ELSE 'XXX-XX-XXXX' +END; + +-- ============================================= +-- 4. ACCOUNT NUMBER TOKENIZATION FUNCTION +-- Purpose: Deterministic masking of account numbers for analytics +-- Usage: Data analysts and reporting teams +-- Input: Account number (e.g., ACC123456) +-- Output: Deterministic token (e.g., ACCT_a3f9c2...) 
+-- ============================================= +CREATE OR REPLACE FUNCTION mask_account_number(account_id STRING) +RETURNS STRING +COMMENT 'ABAC utility: Deterministic account number tokenization for cross-table analytics' +RETURN CASE + WHEN account_id IS NULL OR account_id = '' THEN account_id + ELSE CONCAT('ACCT_', LEFT(SHA2(account_id, 256), 12)) +END; + +-- ============================================= +-- 5. EMAIL MASKING FOR FINANCE FUNCTION +-- Purpose: Mask customer email addresses for privacy +-- Usage: Marketing and customer service teams +-- Input: Email (e.g., john.doe@example.com) +-- Output: Masked email (****@example.com) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_email_finance(email STRING) +RETURNS STRING +COMMENT 'ABAC utility: Mask email local part while preserving domain for GDPR compliance' +RETURN CASE + WHEN email IS NULL OR email = '' THEN email + WHEN LOCATE('@', email) > 0 THEN + CONCAT('****', SUBSTRING(email, LOCATE('@', email))) + ELSE '****' +END; + +-- ============================================= +-- 6. CUSTOMER ID DETERMINISTIC MASKING FUNCTION +-- Purpose: Hash customer IDs for referential integrity in analytics +-- Usage: Data scientists and analysts performing cross-table joins +-- Input: Customer ID (e.g., CUST00123) +-- Output: Deterministic reference (e.g., REF_c8a9f...) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_customer_id_deterministic(customer_id STRING) +RETURNS STRING +COMMENT 'ABAC utility: Deterministic customer ID masking preserving join capability' +RETURN CASE + WHEN customer_id IS NULL OR customer_id = '' THEN customer_id + ELSE CONCAT('REF_', LEFT(SHA2(customer_id, 256), 10)) +END; + +-- ============================================= +-- 7. 
TRANSACTION AMOUNT ROUNDING FUNCTION +-- Purpose: Round transaction amounts for aggregated reporting +-- Usage: Marketing teams and external partners +-- Input: Amount (e.g., 1234.56) +-- Output: Rounded amount (1200.00) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_amount_rounded(amount DECIMAL(18,2)) +RETURNS DECIMAL(18,2) +COMMENT 'ABAC utility: Round amounts to nearest hundred for aggregated analytics' +RETURN CASE + WHEN amount IS NULL THEN NULL + WHEN amount < 100 THEN ROUND(amount, -1) -- Round to nearest 10 + ELSE ROUND(amount, -2) -- Round to nearest 100 +END; + +-- ============================================= +-- 8. PII STRING PARTIAL MASKING FUNCTION +-- Purpose: Show only first and last characters of PII fields +-- Usage: Customer names and addresses for partial visibility +-- Input: String value (e.g., "John") +-- Output: Partially masked string (e.g., "J**n") +-- ============================================= +CREATE OR REPLACE FUNCTION mask_pii_partial(input STRING) +RETURNS STRING +COMMENT 'ABAC utility: Partial PII masking showing first and last characters for GDPR' +RETURN CASE + WHEN input IS NULL OR input = '' THEN input + WHEN LENGTH(input) <= 2 THEN REPEAT('*', LENGTH(input)) + WHEN LENGTH(input) = 3 THEN CONCAT(LEFT(input, 1), '*', RIGHT(input, 1)) + ELSE CONCAT(LEFT(input, 1), REPEAT('*', LENGTH(input) - 2), RIGHT(input, 1)) +END; + +-- ============================================= +-- ROW FILTER FUNCTIONS (Zero-argument for Unity Catalog ABAC) +-- These control which rows are visible to users based on group membership +-- Note: UC ROW FILTER policies require 0-argument functions +-- ============================================= + +-- ============================================= +-- 9. 
TRADING HOURS TIME-BASED FILTER +-- Purpose: Restrict access to trading positions during market hours +-- Usage: Prevent risk managers from accessing live positions during trading +-- Input: None (uses current time) +-- Output: Boolean indicating if access is allowed (outside trading hours 9:30 AM - 4:00 PM ET) +-- ============================================= +CREATE OR REPLACE FUNCTION filter_trading_hours() +RETURNS BOOLEAN +COMMENT 'ABAC utility: Time-based access control for trading positions outside market hours' +RETURN + -- Allow access outside NYSE trading hours (9:30 AM - 4:00 PM ET) + -- Convert to UTC: 9:30 AM ET = 14:30 UTC, 4:00 PM ET = 21:00 UTC (EST) + -- Note: Adjust for daylight saving time in production + CASE + WHEN hour(current_timestamp()) < 14 OR hour(current_timestamp()) >= 21 THEN TRUE + ELSE FALSE + END; + +-- ============================================= +-- 10. INFORMATION BARRIER FILTER (Chinese Wall) +-- Purpose: Block research analysts from trading data +-- Usage: Enforce SEC/MiFID II Chinese wall for research analysts +-- Input: None (checks current user group membership) +-- Output: Boolean - FALSE blocks access for Research_Analyst group +-- ============================================= +CREATE OR REPLACE FUNCTION filter_information_barrier() +RETURNS BOOLEAN +COMMENT 'ABAC utility: Chinese wall - block research analysts from trading positions' +RETURN + -- Research analysts are blocked (return FALSE to deny access) + -- This function is applied only to tables tagged with information_barrier + -- Risk managers and compliance have Neutral access (not blocked) + TRUE; -- Default allow - policy applies this selectively via WHEN clause + +-- ============================================= +-- 11. 
AML CLEARANCE FILTER +-- Purpose: Hide flagged/high-risk transactions from junior analysts +-- Usage: Junior AML analysts cannot see flagged transactions +-- Input: None (checks current user group membership) +-- Output: Boolean - controls visibility of sensitive AML data +-- ============================================= +CREATE OR REPLACE FUNCTION filter_aml_clearance() +RETURNS BOOLEAN +COMMENT 'ABAC utility: Hide flagged transactions from junior AML analysts' +RETURN + -- Junior analysts blocked from flagged transactions + -- Senior investigators and compliance see all + TRUE; -- Default allow - policy WHEN clause controls application + +-- ============================================= +-- 12. REGIONAL DATA RESIDENCY FILTER - EU +-- Purpose: Show only EU customer data to EU staff +-- Usage: GDPR compliance - EU staff see EU data only +-- Input: None (checks current user group membership) +-- Output: Boolean indicating if row should be visible +-- ============================================= +CREATE OR REPLACE FUNCTION filter_by_region_eu() +RETURNS BOOLEAN +COMMENT 'ABAC utility: GDPR - EU regional staff see EU customer data only' +RETURN TRUE; -- Applied via WHEN clause to customer_region='EU' tables + +-- ============================================= +-- 13. REGIONAL DATA RESIDENCY FILTER - US +-- Purpose: Show only US customer data to US staff +-- Usage: CCPA/GLBA compliance - US staff see US data only +-- Input: None (checks current user group membership) +-- Output: Boolean indicating if row should be visible +-- ============================================= +CREATE OR REPLACE FUNCTION filter_by_region_us() +RETURNS BOOLEAN +COMMENT 'ABAC utility: CCPA/GLBA - US regional staff see US customer data only' +RETURN TRUE; -- Applied via WHEN clause to customer_region='US' tables + +-- ============================================= +-- 14. 
REGIONAL DATA RESIDENCY FILTER - APAC +-- Purpose: Show only APAC customer data to APAC staff +-- Usage: PDPA compliance - APAC staff see APAC data only +-- Input: None (checks current user group membership) +-- Output: Boolean indicating if row should be visible +-- ============================================= +CREATE OR REPLACE FUNCTION filter_by_region_apac() +RETURNS BOOLEAN +COMMENT 'ABAC utility: PDPA - APAC regional staff see APAC customer data only' +RETURN TRUE; -- Applied via WHEN clause to customer_region='APAC' tables + +-- ============================================= +-- 15. TEMPORARY AUDITOR ACCESS FILTER +-- Purpose: Grant access to external auditors (always allow within policy scope) +-- Usage: SOX compliance - external auditors with temporary access +-- Input: None (group membership determines access) +-- Output: Boolean indicating if access is allowed +-- ============================================= +CREATE OR REPLACE FUNCTION filter_audit_expiry() +RETURNS BOOLEAN +COMMENT 'ABAC utility: Temporary access control for external auditors (SOX compliance)' +RETURN TRUE; -- Applied via WHEN clause with audit_project tag + +-- ============================================= +-- VERIFICATION AND TESTING +-- ============================================= + +-- List all created functions +SHOW FUNCTIONS IN finance LIKE 'mask*'; +SHOW FUNCTIONS IN finance LIKE 'filter*'; + +SELECT 'āœ… Successfully created 15 finance ABAC functions (8 masking, 7 row filters)' as status; +SELECT 'šŸ“‹ Row filter functions are zero-argument for Unity Catalog ABAC policies' as note; +SELECT 'šŸ” Functions ready for: PCI-DSS, AML/KYC, GDPR, SOX, GLBA compliance' as compliance_frameworks; diff --git a/uc-quickstart/utils/abac/finance/0.2finance_database_schema.sql b/uc-quickstart/utils/abac/finance/0.2finance_database_schema.sql new file mode 100644 index 00000000..0b7eaa44 --- /dev/null +++ b/uc-quickstart/utils/abac/finance/0.2finance_database_schema.sql @@ -0,0 +1,403 @@ +-- 
============================================= +-- DATABRICKS UNITY CATALOG - FINANCE DOMAIN DATABASE SCHEMA +-- Purpose: Create comprehensive financial services database for ABAC demonstrations +-- Compliance: PCI-DSS, AML/KYC, GDPR, SOX, GLBA +-- Tables: Customers, Accounts, Transactions, CreditCards, TradingPositions, AMLAlerts, AuditLogs +-- ============================================= + +USE CATALOG fincat; + +USE SCHEMA finance; + +-- ============================================= +-- TABLE 1: CUSTOMERS +-- Purpose: Core customer master data with PII +-- Compliance: GDPR, GLBA, CCPA +-- ============================================= +DROP TABLE IF EXISTS Customers; + +CREATE TABLE Customers ( + CustomerID STRING NOT NULL, + FirstName STRING, + LastName STRING, + Email STRING, + SSN STRING COMMENT 'Social Security Number - PII/Sensitive', + DateOfBirth DATE, + Address STRING, + City STRING, + State STRING, + ZipCode STRING, + CustomerRegion STRING COMMENT 'Data residency region: EU, US, APAC, LATAM', + AccountOpenDate DATE, + CustomerStatus STRING COMMENT 'Active, Suspended, Closed', + RiskScore INT COMMENT 'AML risk score 1-100', + KYCVerificationDate DATE, + CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +COMMENT 'Customer master data with PII for GDPR/GLBA compliance demonstrations' +TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported'); + +-- Insert sample customer data +INSERT INTO Customers VALUES + ('CUST00001', 'John', 'Smith', 'john.smith@email.com', '123-45-6789', '1975-03-15', '123 Main St', 'New York', 'NY', '10001', 'US', '2020-01-15', 'Active', 25, '2020-01-10', CURRENT_TIMESTAMP()), + ('CUST00002', 'Maria', 'Garcia', 'maria.garcia@email.com', '234-56-7890', '1982-07-22', '456 Oak Ave', 'Los Angeles', 'CA', '90001', 'US', '2019-05-20', 'Active', 15, '2019-05-15', CURRENT_TIMESTAMP()), + ('CUST00003', 'Hans', 'Mueller', 'hans.mueller@email.de', '345-67-8901', '1990-11-08', 'Berliner Str 78', 'Berlin', 'BE', '10115', 'EU', 
'2021-03-10', 'Active', 10, '2021-03-05', CURRENT_TIMESTAMP()), + ('CUST00004', 'Sophie', 'Dubois', 'sophie.dubois@email.fr', '456-78-9012', '1988-02-14', '12 Rue de Paris', 'Paris', 'IDF', '75001', 'EU', '2020-08-25', 'Active', 20, '2020-08-20', CURRENT_TIMESTAMP()), + ('CUST00005', 'Wei', 'Chen', 'wei.chen@email.cn', '567-89-0123', '1985-09-30', '88 Nanjing Rd', 'Shanghai', 'SH', '200001', 'APAC', '2021-11-12', 'Active', 30, '2021-11-10', CURRENT_TIMESTAMP()), + ('CUST00006', 'Sarah', 'Johnson', 'sarah.j@email.com', '678-90-1234', '1992-05-18', '789 Pine St', 'Chicago', 'IL', '60601', 'US', '2022-02-14', 'Active', 12, '2022-02-10', CURRENT_TIMESTAMP()), + ('CUST00007', 'Carlos', 'Silva', 'carlos.silva@email.br', '789-01-2345', '1978-12-03', 'Av Paulista 1000', 'Sao Paulo', 'SP', '01310', 'LATAM', '2019-09-08', 'Active', 45, '2019-09-05', CURRENT_TIMESTAMP()), + ('CUST00008', 'Yuki', 'Tanaka', 'yuki.tanaka@email.jp', '890-12-3456', '1995-06-25', '1-1-1 Shibuya', 'Tokyo', 'TK', '150-0001', 'APAC', '2022-07-19', 'Active', 8, '2022-07-15', CURRENT_TIMESTAMP()), + ('CUST00009', 'Emma', 'Wilson', 'emma.wilson@email.co.uk', '901-23-4567', '1987-04-12', '10 Downing St', 'London', 'LDN', 'SW1A', 'EU', '2020-12-05', 'Suspended', 75, '2020-12-01', CURRENT_TIMESTAMP()), + ('CUST00010', 'Ahmed', 'Al-Saud', 'ahmed.alsaud@email.sa', '012-34-5678', '1983-08-20', 'King Fahd Rd', 'Riyadh', 'RY', '11564', 'APAC', '2021-06-30', 'Active', 55, '2021-06-25', CURRENT_TIMESTAMP()); + +-- ============================================= +-- TABLE 2: ACCOUNTS +-- Purpose: Bank accounts linked to customers +-- Compliance: GLBA, regional banking regulations +-- ============================================= +DROP TABLE IF EXISTS Accounts; + +CREATE TABLE Accounts ( + AccountID STRING NOT NULL, + CustomerID STRING NOT NULL, + AccountType STRING COMMENT 'Checking, Savings, Investment, Credit', + Balance DECIMAL(18,2), + Currency STRING DEFAULT 'USD', + OpenDate DATE, + AccountStatus STRING COMMENT 
'Active, Frozen, Closed', + AccountRegion STRING COMMENT 'Region where account is held', + InterestRate DECIMAL(5,4), + LastTransactionDate DATE, + CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +COMMENT 'Bank account information for balance and transaction tracking' +TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported'); + +INSERT INTO Accounts VALUES + ('ACC1001', 'CUST00001', 'Checking', 15234.50, 'USD', '2020-01-15', 'Active', 'US', 0.0125, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1002', 'CUST00001', 'Savings', 45678.90, 'USD', '2020-01-15', 'Active', 'US', 0.0350, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1003', 'CUST00002', 'Checking', 8945.75, 'USD', '2019-05-20', 'Active', 'US', 0.0125, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1004', 'CUST00003', 'Checking', 12456.30, 'EUR', '2021-03-10', 'Active', 'EU', 0.0100, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1005', 'CUST00003', 'Investment', 78900.00, 'EUR', '2021-06-15', 'Active', 'EU', 0.0000, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1006', 'CUST00004', 'Savings', 23567.85, 'EUR', '2020-08-25', 'Active', 'EU', 0.0300, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1007', 'CUST00005', 'Checking', 34567.20, 'CNY', '2021-11-12', 'Active', 'APAC', 0.0200, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1008', 'CUST00006', 'Checking', 5678.40, 'USD', '2022-02-14', 'Active', 'US', 0.0125, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1009', 'CUST00007', 'Savings', 67890.50, 'BRL', '2019-09-08', 'Active', 'LATAM', 0.0650, '2026-02-08', CURRENT_TIMESTAMP()), + ('ACC1010', 'CUST00009', 'Checking', 2345.60, 'GBP', '2020-12-05', 'Frozen', 'EU', 0.0150, '2026-02-08', CURRENT_TIMESTAMP()); + +-- ============================================= +-- TABLE 3: TRANSACTIONS (RECREATED FOR FRAUD AI DEMO) +-- Purpose: Transaction history for AML monitoring + AI reasoning +-- Compliance: AML/KYC, FATF, FinCEN +-- ============================================= + +DROP TABLE IF EXISTS Transactions; + +CREATE TABLE 
Transactions ( + TransactionID STRING NOT NULL, + AccountID STRING NOT NULL, + TransactionDate TIMESTAMP, + Amount DECIMAL(18,2), + Currency STRING DEFAULT 'USD', + TransactionType STRING COMMENT 'Deposit, Withdrawal, Transfer, Payment', + CountryCode STRING COMMENT 'Country where transaction originated', + MerchantName STRING, + TransactionStatus STRING COMMENT 'Completed, Pending, Flagged, Blocked', + AMLFlagReason STRING COMMENT 'Large transaction, Cross-border, Suspicious pattern', + + -- Added for AI-driven fraud explanation + IsInternational BOOLEAN COMMENT 'TRUE if transaction is cross-border', + ExceedsHighRiskThreshold BOOLEAN COMMENT 'TRUE if amount exceeds high-risk threshold (e.g. >= 10000)', + + CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +COMMENT 'Transaction history for AML/KYC monitoring and fraud investigation with AI context' +TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported'); + +INSERT INTO Transactions VALUES +-- Normal domestic payments +('TXN000001', 'ACC1001', '2026-02-08 14:35:22', 234.50, 'USD', 'Payment', 'US', 'Amazon.com', 'Completed', NULL, FALSE, FALSE, CURRENT_TIMESTAMP()), +('TXN000002', 'ACC1001', '2026-02-08 09:12:45', 1500.00, 'USD', 'Deposit', 'US', 'Payroll Direct Deposit', 'Completed', NULL, FALSE, FALSE, CURRENT_TIMESTAMP()), +('TXN000008', 'ACC1002', '2026-02-08 10:15:55', 500.00, 'USD', 'Payment', 'US', 'Utility Bill', 'Completed', NULL, FALSE, FALSE, CURRENT_TIMESTAMP()), +('TXN000010', 'ACC1008', '2026-02-08 12:05:33', 78.90, 'USD', 'Payment', 'US', 'Coffee Shop', 'Completed', NULL, FALSE, FALSE, CURRENT_TIMESTAMP()), + +-- Large but explainable withdrawals (kept) +('TXN000003', 'ACC1003', '2026-02-08 16:20:10', 15000.00, 'USD', 'Withdrawal', 'US', 'Cash Withdrawal ATM', 'Flagged', 'Large transaction', FALSE, TRUE, CURRENT_TIMESTAMP()), + +-- Existing international transfers (kept) +('TXN000004', 'ACC1004', '2026-02-08 11:45:30', 8500.00, 'EUR', 'Transfer', 'DE', 'International Wire', 'Completed', 
NULL, TRUE, FALSE, CURRENT_TIMESTAMP()), +('TXN000005', 'ACC1007', '2026-02-08 08:30:15', 25000.00, 'CNY', 'Transfer', 'CN', 'Business Payment', 'Completed', NULL, TRUE, TRUE, CURRENT_TIMESTAMP()), + +-- High-risk cash activity (kept) +('TXN000006', 'ACC1009', '2026-02-08 19:55:40', 45000.00, 'BRL', 'Deposit', 'BR', 'Large Cash Deposit', 'Flagged', 'Large transaction', FALSE, TRUE, CURRENT_TIMESTAMP()), + +-- Existing blocked transfer (kept) +('TXN000007', 'ACC1010', '2026-02-08 14:22:18', 12000.00, 'GBP', 'Transfer', 'GB', 'Suspicious Transfer', 'Blocked', 'Suspicious pattern', TRUE, TRUE, CURRENT_TIMESTAMP()), + +-- Investment-related transfer (kept) +('TXN000009', 'ACC1005', '2026-02-08 15:40:25', 12500.00, 'EUR', 'Transfer', 'FR', 'Investment Purchase', 'Completed', NULL, TRUE, TRUE, CURRENT_TIMESTAMP()), + +-- ============================================= +-- DEMO: TWO TOP URGENT ALERT TRANSACTIONS (NEW) +-- ============================================= + +-- āœ… DEMO #1 (Customer aware / reasonable): large first-time international transfer for CUST00001 +('TXN_DEMO_01', 'ACC1001', '2026-02-08 08:30:00', 18000.00, 'USD', 'Transfer', 'DE', 'International Wire - Property Settlement', 'Flagged', 'Cross-border', TRUE, TRUE, CURRENT_TIMESTAMP()), + +-- 🚨 DEMO #2 (Customer unreachable): large international transfer for CUST00009 (already Frozen account ACC1010) +('TXN_DEMO_02', 'ACC1010', '2026-02-08 08:40:00', 22000.00, 'GBP', 'Transfer', 'GB', 'International Wire - Beneficiary Added Recently', 'Blocked', 'Suspicious pattern', TRUE, TRUE, CURRENT_TIMESTAMP()); + +-- ============================================= +-- TABLE 4: CREDIT CARDS +-- Purpose: Credit card information for PCI-DSS compliance +-- Compliance: PCI-DSS +-- ============================================= +DROP TABLE IF EXISTS CreditCards; + +CREATE TABLE CreditCards ( + CardID STRING NOT NULL, + CustomerID STRING NOT NULL, + CardNumber STRING COMMENT 'Full card number - PCI-DSS Sensitive', + CVV 
STRING COMMENT 'Card Verification Value - PCI-DSS prohibits storing CVV after authorization; synthetic demo data only',
prevention +-- ============================================= +DROP TABLE IF EXISTS TradingPositions; + +CREATE TABLE TradingPositions ( + PositionID STRING NOT NULL, + TraderID STRING NOT NULL COMMENT 'User ID of trader', + SecurityID STRING NOT NULL COMMENT 'Stock ticker or security identifier', + SecurityName STRING, + Quantity INT, + EntryPrice DECIMAL(18,4), + CurrentPrice DECIMAL(18,4), + PnL DECIMAL(18,2) COMMENT 'Profit and Loss', + TradingDesk STRING COMMENT 'Equity, Fixed_Income, FX, Commodities', + PositionDate DATE, + PositionStatus STRING COMMENT 'Open, Closed', + InformationBarrier STRING COMMENT 'Trading_Side, Advisory_Side, Neutral', + CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +COMMENT 'Trading positions for Chinese wall and insider trading prevention' +TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported'); + +INSERT INTO TradingPositions VALUES + ('POS00001', 'TRADER001', 'AAPL', 'Apple Inc', 1000, 150.25, 175.50, 25250.00, 'Equity', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()), + ('POS00002', 'TRADER001', 'GOOGL', 'Alphabet Inc', 500, 2800.00, 2950.75, 75375.00, 'Equity', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()), + ('POS00003', 'TRADER002', 'TSLA', 'Tesla Inc', 2000, 185.50, 165.25, -40500.00, 'Equity', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()), + ('POS00004', 'TRADER003', 'US10Y', 'US 10-Year Treasury', 10000000, 98.50, 99.25, 75000.00, 'Fixed_Income', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()), + ('POS00005', 'TRADER004', 'EURUSD', 'Euro/US Dollar', 5000000, 1.0850, 1.0920, 35000.00, 'FX', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()), + ('POS00006', 'TRADER005', 'GC', 'Gold Futures', 100, 2050.00, 2075.50, 2550.00, 'Commodities', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()); + +-- ============================================= +-- TABLE 6: AML ALERTS +-- Purpose: Anti-Money Laundering alert management +-- Compliance: AML/KYC, FATF, 
FinCEN +-- ============================================= +DROP TABLE IF EXISTS AMLAlerts; + +CREATE TABLE AMLAlerts ( + AlertID STRING NOT NULL, + CustomerID STRING NOT NULL, + TransactionID STRING, + AlertDate TIMESTAMP, + AlertType STRING COMMENT 'Large Transaction, Structuring, Cross-Border, Rapid Movement', + RiskScore INT COMMENT '1-100 risk assessment', + InvestigationStatus STRING COMMENT 'New, Under Review, Escalated, Cleared, SAR Filed', + AssignedInvestigator STRING, + InvestigationNotes STRING COMMENT 'Sensitive investigation details', + ResolutionDate TIMESTAMP, + SARFiled BOOLEAN COMMENT 'Suspicious Activity Report filed with FinCEN', + CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +COMMENT 'AML alerts and investigation tracking for compliance monitoring' +TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported'); + +INSERT INTO AMLAlerts VALUES +-- āœ… DEMO #1 (Customer aware) - still urgent but slightly lower than DEMO #2 +( + 'AML_DEMO_01', + 'CUST00001', + 'TXN_DEMO_01', + '2026-02-08 09:00:00', + 'Cross-Border', + 88, + 'Under Review', + 'AML_INV_DEMO', + 'First-time large international transfer flagged by threshold and cross-border controls', + NULL, + FALSE, + CURRENT_TIMESTAMP() +), + +-- 🚨 DEMO #2 (Customer unreachable) - highest urgency +( + 'AML_DEMO_02', + 'CUST00009', + 'TXN_DEMO_02', + '2026-02-08 09:05:00', + 'Cross-Border', + 92, + 'Under Review', + 'AML_INV_DEMO', + 'Large international transfer blocked; account is frozen and customer could not be reached', + NULL, + FALSE, + CURRENT_TIMESTAMP() +); +-- ============================================= +-- TABLE 7: AUDIT LOGS +-- Purpose: Audit trail for SOX compliance +-- Compliance: SOX, regulatory audit requirements +-- ============================================= +DROP TABLE IF EXISTS AuditLogs; + +CREATE TABLE AuditLogs ( + LogID STRING NOT NULL, + UserID STRING NOT NULL, + UserRole STRING, + AccessTime TIMESTAMP, + TableAccessed STRING, + OperationType STRING COMMENT 
'SELECT, INSERT, UPDATE, DELETE', + RecordsAffected INT, + AuditProject STRING COMMENT 'Q1_SOX_Audit, Annual_Financial_Audit, Regulatory_Review', + AccessGrantedUntil DATE COMMENT 'Temporary access expiration date', + IPAddress STRING, + SessionID STRING, + CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +COMMENT 'Audit log for access tracking and SOX compliance' +TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported'); + +INSERT INTO AuditLogs VALUES + ('LOG00001', 'auditor@external.com', 'External_Auditor', '2026-02-08 10:30:00', 'Accounts', 'SELECT', 150, 'Q1_SOX_Audit', '2026-02-08', '203.0.113.25', 'SESS_A1B2C3', CURRENT_TIMESTAMP()), + ('LOG00002', 'compliance@company.com', 'Compliance_Officer', '2026-02-08 14:20:00', 'AMLAlerts', 'SELECT', 45, 'Regulatory_Review', '2026-02-08', '198.51.100.42', 'SESS_D4E5F6', CURRENT_TIMESTAMP()), + ('LOG00003', 'analyst@company.com', 'AML_Investigator_Senior', '2026-02-08 09:15:00', 'Transactions', 'SELECT', 8932, NULL, '2026-02-08', '192.0.2.15', 'SESS_G7H8I9', CURRENT_TIMESTAMP()), + ('LOG00004', 'support@company.com', 'Credit_Card_Support', '2026-02-08 11:45:00', 'CreditCards', 'SELECT', 23, NULL, '2026-02-08', '198.51.100.87', 'SESS_J1K2L3', CURRENT_TIMESTAMP()); + +DROP TABLE IF EXISTS CustomerInteractions; + +CREATE TABLE CustomerInteractions ( + InteractionID STRING NOT NULL, + CustomerID STRING NOT NULL, + InteractionTime TIMESTAMP, + Channel STRING COMMENT 'Call, Chat, Email', + AgentID STRING, + InteractionNotes STRING COMMENT 'Free-text customer interaction notes', + CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +COMMENT 'Customer interaction history used for fraud investigation context' +TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported'); + +INSERT INTO CustomerInteractions VALUES +-- āœ… Customer aware -> approve/monitor +( + 'INT_DEMO_01', + 'CUST00001', + '2026-02-08 08:45:00', + 'Call', + 'AGENT_101', + 'Customer confirmed the international transfer was intentional and related 
to an overseas property purchase. Customer acknowledged the amount and destination account.', + CURRENT_TIMESTAMP() +), + +-- 🚨 Customer unreachable -> escalate +( + 'INT_DEMO_02', + 'CUST00009', + '2026-02-08 08:50:00', + 'Call', + 'AGENT_102', + 'Multiple attempts were made to contact the customer regarding the international transfer. No response was received and the customer could not be reached.', + CURRENT_TIMESTAMP() +); + +-- ============================================= +-- VERIFICATION +-- ============================================= + +-- Show all created tables +SHOW TABLES IN finance; + +-- Display row counts +SELECT 'Customers' as table_name, COUNT(*) as row_count FROM Customers +UNION ALL +SELECT 'Accounts', COUNT(*) FROM Accounts +UNION ALL +SELECT 'Transactions', COUNT(*) FROM Transactions +UNION ALL +SELECT 'CreditCards', COUNT(*) FROM CreditCards +UNION ALL +SELECT 'TradingPositions', COUNT(*) FROM TradingPositions +UNION ALL +SELECT 'AMLAlerts', COUNT(*) FROM AMLAlerts +UNION ALL +SELECT 'AuditLogs', COUNT(*) FROM AuditLogs +ORDER BY table_name; + +SELECT 'āœ… Successfully created 7 finance tables with sample data' as status; +SELECT 'šŸ“Š Tables: Customers, Accounts, Transactions, CreditCards, TradingPositions, AMLAlerts, AuditLogs' as tables_created; +SELECT 'šŸ” Ready for: PCI-DSS, AML/KYC, GDPR, SOX, GLBA compliance demonstrations' as compliance_ready; + + +-- Show the two top urgent alerts +SELECT + a.AlertID, + a.AlertDate, + a.RiskScore, + a.InvestigationStatus, + a.CustomerID, + a.TransactionID +FROM AMLAlerts a +ORDER BY a.RiskScore DESC, a.AlertDate DESC; + +-- Verify both demo transactions exist and are international + exceed threshold +SELECT + TransactionID, + AccountID, + TransactionDate, + Amount, + Currency, + CountryCode, + TransactionStatus, + AMLFlagReason, + IsInternational, + ExceedsHighRiskThreshold +FROM Transactions +WHERE TransactionID IN ('TXN_DEMO_01', 'TXN_DEMO_02') +ORDER BY TransactionDate; + +-- Verify 
interactions exist for both customers +SELECT + CustomerID, + InteractionTime, + Channel, + AgentID, + InteractionNotes +FROM CustomerInteractions +ORDER BY InteractionTime DESC; diff --git a/uc-quickstart/utils/abac/finance/1.CreateFinanceGroups.py b/uc-quickstart/utils/abac/finance/1.CreateFinanceGroups.py new file mode 100644 index 00000000..a2bd0f90 --- /dev/null +++ b/uc-quickstart/utils/abac/finance/1.CreateFinanceGroups.py @@ -0,0 +1,315 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # šŸ‘„ Finance ABAC Account Groups Setup +# MAGIC +# MAGIC ## šŸ“‹ Overview +# MAGIC This notebook creates all the required **account-level user groups** for finance ABAC scenarios using Databricks Account SCIM API. +# MAGIC +# MAGIC ### šŸŽÆ Groups to Create (5 Total – Minimal Demo) +# MAGIC **Primary:** Use Terraform (genie/aws) to create groups. This script is optional/backup. +# MAGIC 1. **Junior_Analyst** - Masked PII, last-4 card, rounded transaction amounts +# MAGIC 2. **Senior_Analyst** - Unmasked PII, full card, full transaction details +# MAGIC 3. **US_Region_Staff** - Row access limited to US customer data +# MAGIC 4. **EU_Region_Staff** - Row access limited to EU customer data +# MAGIC 5. 
**Compliance_Officer** - Full unmasked access (all regions, all columns) +# MAGIC +# MAGIC ## āš ļø Prerequisites +# MAGIC - **Must be run in Databricks workspace** (uses `dbutils` for token) +# MAGIC - **Account admin permissions** to create account-level groups +# MAGIC - Unity Catalog enabled workspace +# MAGIC +# MAGIC ## šŸ”§ API Notes +# MAGIC - Creates **account-level groups** using Account SCIM API +# MAGIC - Uses `/api/2.0/account/scim/v2/Groups` endpoint +# MAGIC - Groups will be available across all workspaces in the account +# MAGIC +# MAGIC --- + +# COMMAND ---------- + +# Import required libraries +import requests +import json +import os +from typing import List, Dict, Any + +# COMMAND ---------- + +# Configuration - Get from Databricks context +workspace_url = spark.conf.get("spark.databricks.workspaceUrl") +workspace_url = f"https://{workspace_url}" + +# Account domain is the workspace domain for account API +account_domain = workspace_url + +# Get token from Databricks context (when running in Databricks) +try: + token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get() + print("āœ… Token retrieved from Databricks context") +except: + print("āŒ Could not retrieve token from Databricks context") + print("ā„¹ļø Make sure this notebook is running in a Databricks workspace") + raise Exception("Token retrieval failed - ensure notebook is running in Databricks") + +# Setup API headers for Account SCIM API +headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json" +} + +# Use Account SCIM API endpoint for group management +account_scim_url = f"{account_domain}/api/2.0/account/scim/v2/Groups" + +print(f"🌐 Account SCIM URL: {account_scim_url}") +print(f"šŸ¦ Account Domain: {account_domain}") +print("āš ļø Note: Creating account-level groups requires account admin permissions") + +# COMMAND ---------- + +# Define finance user groups (minimal 5-group demo; Terraform is primary) +finance_groups 
= {
    # NOTE(review): the "tags" and "description" fields below are informational
    # only — create_account_group sends just schemas + displayName to the
    # Account SCIM API, so these values are documentation for the demo and the
    # matching Terraform config, not applied group attributes.
    "Junior_Analyst": {
        "display_name": "Junior Analyst",
        "description": "Junior analysts with masked PII, last-4 card only, rounded transaction amounts",
        "tags": ["aml_clearance:Junior_Analyst", "pii_level:Limited_PII", "pci_clearance:Basic"]
    },
    "Senior_Analyst": {
        "display_name": "Senior Analyst",
        "description": "Senior analysts with unmasked PII, full card number, full transaction details",
        "tags": ["aml_clearance:Senior_Investigator", "pii_level:Full_PII", "pci_clearance:Full"]
    },
    "US_Region_Staff": {
        "display_name": "US Region Staff",
        "description": "Staff with row access limited to US customer data (GLBA, CCPA)",
        "tags": ["data_residency:US", "customer_region:US"]
    },
    "EU_Region_Staff": {
        "display_name": "EU Region Staff",
        "description": "Staff with row access limited to EU customer data (GDPR)",
        "tags": ["data_residency:EU", "customer_region:EU"]
    },
    "Compliance_Officer": {
        "display_name": "Compliance Officer",
        "description": "Full unmasked access to all regions and columns for audit",
        "tags": ["aml_clearance:Compliance_Officer", "pci_clearance:Administrative"]
    }
}

print(f"šŸ“Š Prepared {len(finance_groups)} finance user groups for creation")
print("\nšŸ¦ Finance Groups:")
for group_name, details in finance_groups.items():
    # Truncate long descriptions for a compact console summary.
    print(f" • {group_name}: {details['description'][:60]}...")

# COMMAND ----------

# Utility function to create an account-level group using Account SCIM API
def create_account_group(group_name: str, display_name: str, description: str) -> Dict[str, Any]:
    """
    Create a Databricks account-level group using Account SCIM API

    Args:
        group_name: The group name (used as displayName)
        display_name: Human-readable display name (same as group_name)
        description: Group description (for documentation)

    Returns:
        API response as dictionary
    """

    # Check if group already exists using Account SCIM API
    # (idempotency guard: re-running the notebook skips groups that exist).
    try:
        list_response = requests.get(account_scim_url, headers=headers)
if list_response.status_code == 200: + existing_groups = list_response.json().get('Resources', []) + for group in existing_groups: + if group.get('displayName') == group_name: + print(f"ā„¹ļø Account group already exists: {group_name}") + print(f" šŸ“‹ Group ID: {group.get('id', 'Unknown')}") + return {"success": True, "message": "Group already exists", "action": "skipped", "group_id": group.get('id')} + except Exception as e: + print(f"āš ļø Could not check existing account groups: {str(e)}") + + # Create the group payload using Account SCIM format + create_payload = { + "schemas": ["urn:ietf:params:scim:schemas:core:2.0:Group"], + "displayName": group_name + } + + # Make the API call to create account-level group + try: + create_response = requests.post(account_scim_url, headers=headers, data=json.dumps(create_payload)) + + if create_response.status_code == 201: # SCIM returns 201 for creation + response_data = create_response.json() + group_id = response_data.get('id', 'Unknown') + print(f"āœ… Successfully created account group: {group_name}") + print(f" šŸ“‹ Group ID: {group_id}") + print(f" šŸ“ Display Name: {display_name}") + print(f" šŸ“„ Description: {description[:80]}{'...' 
if len(description) > 80 else ''}") + return {"success": True, "response": response_data, "action": "created", "group_id": group_id} + else: + print(f"āŒ Failed to create account group: {group_name}") + print(f" Status Code: {create_response.status_code}") + print(f" Response: {create_response.text}") + return {"success": False, "error": create_response.text, "action": "failed"} + + except Exception as e: + print(f"āŒ Exception creating account group {group_name}: {str(e)}") + return {"success": False, "error": str(e), "action": "failed"} + +# COMMAND ---------- + +# Create all finance account groups +print("šŸš€ Starting finance account group creation...\n") + +results = {} +success_count = 0 +skip_count = 0 +failure_count = 0 + +for group_name, config in finance_groups.items(): + print(f"\n{'='*60}") + print(f"Creating account group: {group_name}") + print(f"{'='*60}") + + result = create_account_group( + group_name=group_name, + display_name=config["display_name"], + description=config["description"] + ) + + results[group_name] = result + + if result["success"] and result["action"] == "created": + success_count += 1 + elif result["success"] and result["action"] == "skipped": + skip_count += 1 + else: + failure_count += 1 + + print() + +print(f"\n{'='*60}") +print("šŸ“Š ACCOUNT GROUP CREATION SUMMARY") +print(f"{'='*60}") +print(f"āœ… Successfully Created: {success_count}") +print(f"ā­ļø Already Existed: {skip_count}") +print(f"āŒ Failed: {failure_count}") +print(f"šŸ“Š Total Groups: {len(finance_groups)}") + +# Display created group IDs for reference +print(f"\nšŸ“‹ Created Group IDs:") +for group_name, result in results.items(): + if result.get("success") and "group_id" in result: + print(f" • {group_name}: {result['group_id']}") + +# COMMAND ---------- + +# Verify all account groups were created successfully +print("šŸ” Verifying created finance account groups...\n") + +try: + list_response = requests.get(account_scim_url, headers=headers) + + if 
list_response.status_code == 200: + all_groups = list_response.json().get('Resources', []) + group_names = [group.get('displayName') for group in all_groups] + + print(f"šŸ“‹ Total account groups: {len(all_groups)}") + print("\nšŸ¦ Finance account groups found:") + + finance_groups_found = [] + for group_name in finance_groups.keys(): + if group_name in group_names: + finance_groups_found.append(group_name) + # Find the group ID + group_id = next((g.get('id') for g in all_groups if g.get('displayName') == group_name), 'Unknown') + print(f" āœ… {group_name} (ID: {group_id})") + else: + print(f" āŒ {group_name} - NOT FOUND") + + print(f"\nšŸ“Š Finance account groups verification:") + print(f" • Found: {len(finance_groups_found)}/{len(finance_groups)}") + + if len(finance_groups_found) == len(finance_groups): + print("\nšŸŽ‰ All finance account groups created and verified successfully!") + print("\nāœ… Next Steps:") + print(" 1. Groups are now available across all workspaces in your account") + print(" 2. Assign users to groups via Databricks Admin Console or API") + print(" 3. Groups can be used in Unity Catalog ABAC policies") + print(" 4. Run 2.CreateFinanceTagPolicies.py to create tag policies") + print(" 5. Run 3.ApplyFinanceSetTags.sql to tag tables") + print(" 6. Run 4.CreateFinanceABACPolicies.sql to create ABAC policies") + else: + missing = set(finance_groups.keys()) - set(finance_groups_found) + print(f"\nāš ļø Missing groups: {missing}") + + else: + print(f"āŒ Failed to list account groups. 
Status: {list_response.status_code}") + print(f"Response: {list_response.text}") + + if list_response.status_code == 403: + print("\nšŸ’” Troubleshooting:") + print(" • Ensure you have account admin permissions") + print(" • Verify the token has account-level permissions") + print(" • Check if account SCIM API is enabled") + +except Exception as e: + print(f"āŒ Exception while listing account groups: {str(e)}") + +# COMMAND ---------- + +# Display group mapping to minimal 5 scenarios +print("\nšŸ“‹ Group to Scenario Mapping (Minimal Demo):\n") + +scenario_mapping = { + "1. PII masking": ["Junior_Analyst", "Senior_Analyst", "Compliance_Officer"], + "2. Fraud/card": ["Junior_Analyst", "Senior_Analyst", "Compliance_Officer"], + "3. Fraud/transactions": ["Junior_Analyst", "Senior_Analyst", "Compliance_Officer"], + "4. US region": ["US_Region_Staff"], + "5. EU region": ["EU_Region_Staff"] +} + +for scenario, groups in scenario_mapping.items(): + print(f"\n{scenario}") + print(f" Groups: {', '.join(groups)}") + for group in groups: + if group in finance_groups: + print(f" • {group}: {finance_groups[group]['description'][:60]}...") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## šŸŽÆ Next Steps After Account Group Creation +# MAGIC +# MAGIC ### āœ… **Account Groups Created Successfully** +# MAGIC All 5 finance account groups (minimal demo) are now available across all workspaces in your Databricks account: +# MAGIC +# MAGIC ### šŸ“‹ **Ready for ABAC Implementation:** +# MAGIC 1. **Apply Unity Catalog Tag Policies** - Run `2.CreateFinanceTagPolicies.py` +# MAGIC 2. **Tag Tables** - Run `3.ApplyFinanceSetTags.sql` +# MAGIC 3. **Deploy ABAC Policies** - Execute `4.CreateFinanceABACPolicies.sql` āœ… Will now work! +# MAGIC 4. **Assign Users to Groups** - Add users to appropriate account groups +# MAGIC 5. 
**Test Scenarios** - Validate policies with `5.TestFinanceABACPolicies.sql` +# MAGIC +# MAGIC ### šŸ‘„ **User Assignment Options:** +# MAGIC - **Databricks Account Console** - Assign users to account groups via Admin Console +# MAGIC - **Account SCIM API** - Programmatic user assignment to account groups +# MAGIC - **Identity Provider Integration** - Automated user provisioning via SSO +# MAGIC +# MAGIC ### šŸ” **ABAC Policy Binding:** +# MAGIC The ABAC policies in `4.CreateFinanceABACPolicies.sql` will now work with these account groups: +# MAGIC - Policies use `TO 'Group_Name'` syntax to bind to these account groups +# MAGIC - Tag-based conditions will evaluate account group membership +# MAGIC - Row filters and column masks will apply based on account group assignments +# MAGIC +# MAGIC ### šŸ“Š **Account vs Workspace Groups:** +# MAGIC - **Account Groups** (what we created): Available across all workspaces +# MAGIC - **Workspace Groups**: Local to individual workspaces only +# MAGIC - **Unity Catalog ABAC**: Works with both account and workspace groups +# MAGIC +# MAGIC ## šŸ¦ Finance ABAC Account Groups Ready! šŸŽ‰ +# MAGIC +# MAGIC Your Databricks account now has all the required groups for comprehensive financial services data governance using Unity Catalog ABAC policies across all workspaces. +# MAGIC +# MAGIC --- diff --git a/uc-quickstart/utils/abac/finance/2.CreateFinanceTagPolicies.py b/uc-quickstart/utils/abac/finance/2.CreateFinanceTagPolicies.py new file mode 100644 index 00000000..817b1452 --- /dev/null +++ b/uc-quickstart/utils/abac/finance/2.CreateFinanceTagPolicies.py @@ -0,0 +1,405 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # šŸ·ļø Finance ABAC Tag Policies Creation +# MAGIC +# MAGIC This notebook creates comprehensive Unity Catalog tag policies for finance ABAC scenarios using Databricks REST API. 
+# MAGIC +# MAGIC ## šŸ“‹ Prerequisites +# MAGIC - Databricks workspace with Unity Catalog enabled +# MAGIC - Account admin or user with CREATE permission for tag policies +# MAGIC - Personal Access Token with appropriate permissions +# MAGIC +# MAGIC ## šŸŽÆ Tag Policies to Create (11 Total) +# MAGIC 1. **pci_clearance** - PCI-DSS access levels for payment card data +# MAGIC 2. **payment_role** - Payment processing roles +# MAGIC 3. **aml_clearance** - AML investigation clearance levels +# MAGIC 4. **trading_desk** - Trading desk assignment +# MAGIC 5. **information_barrier** - Chinese wall classification +# MAGIC 6. **data_residency** - Geographic data residency requirements +# MAGIC 7. **customer_region** - Customer data geographic location +# MAGIC 8. **market_hours** - Trading hours access control +# MAGIC 9. **audit_project** - Specific audit project identification +# MAGIC 10. **pii_level** - Personal information access classification +# MAGIC 11. **sox_scope** - SOX audit scope classification + +# COMMAND ---------- + +# Import required libraries +import requests +import json +from typing import List, Dict, Any + +# COMMAND ---------- + +# Configuration - Update these values for your environment +workspace_url = "https://dbc-0f56e540-7f65.cloud.databricks.com" # Update with your workspace URL + +# Get token from Databricks secrets or environment +# Option 1: From dbutils (if running in Databricks) +try: + token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get() + print("āœ… Token retrieved from Databricks context") +except: + print("āœ… Token can't be retrieved from configuration") + +# Setup API headers and base URL +headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json" +} +base_url = f"{workspace_url}/api/2.0/tag-policies" +print(f"🌐 Base URL: {base_url}") + +# COMMAND ---------- + +# Utility function to create tag policy +def create_tag_policy(tag_key: str, allowed_values: List[str], 
description: str) -> Dict[str, Any]: + """ + Create a Unity Catalog tag policy using REST API + + Args: + tag_key: The tag key name (case sensitive) + allowed_values: List of allowed values for this tag + description: Description of the tag policy + + Returns: + API response as dictionary + """ + + # First, try to delete existing tag policy (if exists) + delete_url = f"{base_url}/{tag_key}" + try: + delete_response = requests.delete(delete_url, headers=headers) + if delete_response.status_code == 200: + print(f"šŸ—‘ļø Deleted existing tag policy: {tag_key}") + except Exception as e: + print(f"ā„¹ļø No existing tag policy to delete for: {tag_key}") + + # Create the tag policy payload + create_payload = { + "tag_policy": { + "key": tag_key, + "values": [{"name": value} for value in allowed_values], + "description": description + } + } + + # Make the API call to create tag policy + try: + create_response = requests.post(base_url, headers=headers, data=json.dumps(create_payload)) + + if create_response.status_code == 200: + print(f"āœ… Successfully created tag policy: {tag_key}") + print(f" šŸ“ Description: {description}") + print(f" šŸ·ļø Allowed values ({len(allowed_values)}): {', '.join(allowed_values[:5])}{'...' 
if len(allowed_values) > 5 else ''}") + return {"success": True, "response": create_response.json()} + else: + print(f"āŒ Failed to create tag policy: {tag_key}") + print(f" Status Code: {create_response.status_code}") + print(f" Response: {create_response.text}") + return {"success": False, "error": create_response.text} + + except Exception as e: + print(f"āŒ Exception creating tag policy {tag_key}: {str(e)}") + return {"success": False, "error": str(e)} + +# COMMAND ---------- + +# Define all finance tag policies +finance_tag_policies = { + "pci_clearance": { + "values": [ + "Basic", + "Standard", + "Full", + "Administrative" + ], + "description": "PCI-DSS access levels: Basic=last4, Standard=masked, Full=complete card data, Administrative=all cardholder data" + }, + + "payment_role": { + "values": [ + "Customer_Service", + "Fraud_Analyst", + "Compliance_Officer", + "Payment_Processor" + ], + "description": "Payment processing roles for PCI-DSS access control" + }, + + "aml_clearance": { + "values": [ + "Junior_Analyst", + "Senior_Investigator", + "Compliance_Officer", + "FinCEN_Reporter" + ], + "description": "AML investigation clearance levels for progressive data access (AML/KYC, FATF compliance)" + }, + + "trading_desk": { + "values": [ + "Equity", + "Fixed_Income", + "FX", + "Commodities", + "Research", + "Risk_Management" + ], + "description": "Trading desk assignment for position data access control" + }, + + "information_barrier": { + "values": [ + "Trading_Side", + "Advisory_Side", + "Neutral" + ], + "description": "Chinese wall information barrier classification (SEC, MiFID II compliance)" + }, + + "data_residency": { + "values": [ + "EU", + "US", + "APAC", + "LATAM", + "Global" + ], + "description": "Geographic data residency requirements for GDPR, CCPA, PDPA compliance" + }, + + "customer_region": { + "values": [ + "EU", + "US", + "APAC", + "LATAM" + ], + "description": "Customer data geographic location for regional data access control" + }, + + 
"market_hours": { + "values": [ + "Trading_Hours", + "After_Hours", + "Weekend", + "24x7" + ], + "description": "Market hours-based access control for trading positions (prevent manipulation during trading)" + }, + + "audit_project": { + "values": [ + "Q1_SOX_Audit", + "Q2_SOX_Audit", + "Q3_SOX_Audit", + "Q4_SOX_Audit", + "Annual_Financial_Audit", + "Regulatory_Review", + "Internal_Audit" + ], + "description": "Specific audit project identification for temporary access control (SOX compliance)" + }, + + "pii_level": { + "values": [ + "Full_PII", + "Limited_PII", + "De_Identified", + "Statistical_Only" + ], + "description": "Personal information access classification for GDPR, GLBA, CCPA privacy compliance" + }, + + "sox_scope": { + "values": [ + "In_Scope", + "Out_Of_Scope" + ], + "description": "SOX audit scope classification for financial reporting controls" + } +} + +print(f"šŸ“Š Prepared {len(finance_tag_policies)} finance tag policies for creation") + +# COMMAND ---------- + +import time + +# Create all finance tag policies +print("šŸš€ Starting finance tag policy creation...\n") + +results = {} +success_count = 0 +failure_count = 0 + +for tag_key, config in finance_tag_policies.items(): + print(f"\n{'='*60}") + print(f"Creating tag policy: {tag_key}") + print(f"{'='*60}") + + result = create_tag_policy( + tag_key=tag_key, + allowed_values=config["values"], + description=config["description"] + ) + + results[tag_key] = result + + if result["success"]: + success_count += 1 + else: + failure_count += 1 + + print("\n") + time.sleep(1.5) +print(f"\n{'='*60}") +print("šŸ“Š CREATION SUMMARY") +print(f"{'='*60}") +print(f"āœ… Successful: {success_count}") +print(f"āŒ Failed: {failure_count}") +print(f"šŸ“Š Total: {len(finance_tag_policies)}") + +# COMMAND ---------- + +# List all created tag policies for verification +print("šŸ” Verifying created tag policies...\n") + +try: + list_response = requests.get(base_url, headers=headers) + + if list_response.status_code 
== 200: + policies = list_response.json() + + print(f"šŸ“‹ Found {len(policies.get('tag_policies', []))} tag policies in Unity Catalog:") + print("\n" + "="*80) + + finance_policies = [] + for policy in policies.get('tag_policies', []): + key = policy.get('key', 'Unknown') + description = policy.get('description', 'No description') + values = [v.get('name', '') for v in policy.get('values', [])] + + # Check if this is one of our finance policies + if key in finance_tag_policies: + finance_policies.append(key) + print(f"šŸ¦ {key}") + print(f" šŸ“ Description: {description}") + print(f" šŸ·ļø Values ({len(values)}): {', '.join(values[:5])}{'...' if len(values) > 5 else ''}") + print() + + print(f"\nāœ… Finance tag policies found: {len(finance_policies)}/{len(finance_tag_policies)}") + + if len(finance_policies) == len(finance_tag_policies): + print("šŸŽ‰ All finance tag policies created successfully!") + else: + missing = set(finance_tag_policies.keys()) - set(finance_policies) + print(f"āš ļø Missing policies: {missing}") + + else: + print(f"āŒ Failed to list tag policies. 
Status: {list_response.status_code}") + print(f"Response: {list_response.text}") + +except Exception as e: + print(f"āŒ Exception while listing tag policies: {str(e)}") + +# COMMAND ---------- + +# Generate sample tag application SQL for reference +print("šŸ“‹ Sample SQL for applying tags to finance tables:\n") + +sample_sql = ''' +-- Use the finance catalog and schema +USE CATALOG fincat; +USE SCHEMA finance; + +-- Example: Apply PCI-DSS tags to CreditCards table +ALTER TABLE CreditCards +SET TAGS ( + 'pci_clearance' = 'Full', + 'payment_role' = 'Fraud_Analyst' +); + +-- Example: Apply PCI tags to sensitive card columns +ALTER TABLE CreditCards ALTER COLUMN CardNumber +SET TAGS ( + 'pci_clearance' = 'Full', + 'payment_role' = 'Fraud_Analyst' +); + +ALTER TABLE CreditCards ALTER COLUMN CVV +SET TAGS ( + 'pci_clearance' = 'Administrative' +); + +-- Example: Apply AML tags to Transactions table +ALTER TABLE Transactions +SET TAGS ( + 'aml_clearance' = 'Senior_Investigator' +); + +-- Example: Apply Chinese wall tags to TradingPositions +ALTER TABLE TradingPositions +SET TAGS ( + 'trading_desk' = 'Equity', + 'information_barrier' = 'Trading_Side' +); + +-- Example: Apply data residency tags to Customers +ALTER TABLE Customers +SET TAGS ( + 'data_residency' = 'Global', + 'pii_level' = 'Full_PII' +); + +ALTER TABLE Customers ALTER COLUMN CustomerRegion +SET TAGS ( + 'customer_region' = 'EU' +); + +-- Verify tag assignments +SELECT table_name, tag_name, tag_value +FROM system.information_schema.table_tags +WHERE schema_name = 'finance' +ORDER BY table_name, tag_name; +''' + +print(sample_sql) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## šŸŽÆ Next Steps +# MAGIC +# MAGIC After running this notebook successfully: +# MAGIC +# MAGIC 1. **Verify tag policies** are created in Databricks Account Console → Data → Tag Policies +# MAGIC 2. **Apply tags to tables** using `3.ApplyFinanceSetTags.sql` +# MAGIC 3. 
**Create ABAC policies** using `4.CreateFinanceABACPolicies.sql` +# MAGIC 4. **Test access control** with different user personas and tag assignments +# MAGIC +# MAGIC ## šŸ“š Tag Policy Summary +# MAGIC +# MAGIC ### Payment & Card Security (PCI-DSS) +# MAGIC - `pci_clearance` - 4 levels from Basic to Administrative +# MAGIC - `payment_role` - Payment processing team roles +# MAGIC +# MAGIC ### AML & Compliance +# MAGIC - `aml_clearance` - Progressive AML investigation access +# MAGIC - `sox_scope` - SOX audit scope classification +# MAGIC - `audit_project` - Temporary auditor access projects +# MAGIC +# MAGIC ### Trading & Markets +# MAGIC - `trading_desk` - Trading desk assignments +# MAGIC - `information_barrier` - Chinese wall enforcement +# MAGIC - `market_hours` - Time-based trading access +# MAGIC +# MAGIC ### Privacy & Residency +# MAGIC - `pii_level` - Personal information classification +# MAGIC - `data_residency` - Geographic data hosting requirements +# MAGIC - `customer_region` - Customer data geographic location +# MAGIC +# MAGIC ## šŸ¦ Finance ABAC Demo Ready! +# MAGIC +# MAGIC Your Unity Catalog is now equipped with comprehensive tag policies for enterprise financial services data governance! 
šŸŽ‰ diff --git a/uc-quickstart/utils/abac/finance/3.ApplyFinanceSetTags.sql b/uc-quickstart/utils/abac/finance/3.ApplyFinanceSetTags.sql new file mode 100644 index 00000000..f35497d2 --- /dev/null +++ b/uc-quickstart/utils/abac/finance/3.ApplyFinanceSetTags.sql @@ -0,0 +1,85 @@ +-- ============================================= +-- APPLY FINANCE ABAC TAGS (Minimal 5 Scenarios) +-- Purpose: Tag tables/columns for 5 ABAC scenarios only +-- Tables: Customers, CreditCards, Transactions, Accounts +-- ============================================= + +USE CATALOG fincat; +USE SCHEMA finance; + +-- ============================================= +-- SCENARIO 1: PII MASKING (Customers) +-- Junior: masked; Senior + Compliance: unmasked +-- ============================================= + +ALTER TABLE Customers +SET TAGS ( + 'data_residency' = 'Global', + 'pii_level' = 'Full_PII' +); + +ALTER TABLE Customers ALTER COLUMN CustomerRegion +SET TAGS ( + 'customer_region' = 'EU', + 'data_residency' = 'EU' +); + +ALTER TABLE Customers ALTER COLUMN SSN +SET TAGS ( + 'pii_level' = 'Full_PII', + 'data_residency' = 'US' +); + +ALTER TABLE Customers ALTER COLUMN FirstName SET TAGS ('pii_level' = 'Limited_PII'); +ALTER TABLE Customers ALTER COLUMN LastName SET TAGS ('pii_level' = 'Limited_PII'); +ALTER TABLE Customers ALTER COLUMN Email SET TAGS ('pii_level' = 'Limited_PII'); + +SELECT 'āœ… SCENARIO 1: PII and region tags applied to Customers' as status; + +-- ============================================= +-- SCENARIO 2: FRAUD / CARD (CreditCards) +-- Junior: last-4 only; Senior: full card; Compliance: full + CVV +-- ============================================= + +ALTER TABLE CreditCards SET TAGS ('pci_clearance' = 'Full'); + +ALTER TABLE CreditCards ALTER COLUMN CardNumber SET TAGS ('pci_clearance' = 'Full'); +ALTER TABLE CreditCards ALTER COLUMN CVV SET TAGS ('pci_clearance' = 'Administrative'); + +SELECT 'āœ… SCENARIO 2: PCI tags applied to CreditCards' as status; + +-- 
============================================= +-- SCENARIO 3: FRAUD / TRANSACTIONS (Amount rounding) +-- Junior: rounded amounts; Senior + Compliance: full +-- ============================================= + +ALTER TABLE Transactions SET TAGS ('aml_clearance' = 'Senior_Investigator'); + +ALTER TABLE Transactions ALTER COLUMN Amount SET TAGS ('aml_clearance' = 'Junior_Analyst'); + +SELECT 'āœ… SCENARIO 3: AML tags applied to Transactions' as status; + +-- ============================================= +-- SCENARIOS 4 & 5: REGIONAL ROW FILTERS (US / EU) +-- Tag tables so row filter policies apply (filter functions restrict by row) +-- ============================================= + +-- Customers table: in scope for regional policies (US_Region_Staff -> US rows; EU_Region_Staff -> EU rows) +ALTER TABLE Customers SET TAGS ('customer_region' = 'Regional', 'data_residency' = 'Global'); + +-- Accounts: optional for regional demo +ALTER TABLE Accounts SET TAGS ('data_residency' = 'Global', 'customer_region' = 'Regional'); + +SELECT 'āœ… SCENARIOS 4 & 5: Region tags applied for US/EU row filters' as status; + +-- ============================================= +-- VERIFICATION +-- ============================================= + +SELECT table_name, tag_name, tag_value +FROM system.information_schema.table_tags +WHERE schema_name = 'finance' +ORDER BY table_name, tag_name; + +SELECT 'āœ… Minimal finance ABAC tags applied (5 scenarios)' as status; +SELECT 'šŸ” Next: 4.CreateFinanceABACPolicies.sql' as next_step; diff --git a/uc-quickstart/utils/abac/finance/4.CreateFinanceABACPolicies.sql b/uc-quickstart/utils/abac/finance/4.CreateFinanceABACPolicies.sql new file mode 100644 index 00000000..4f3a8eb8 --- /dev/null +++ b/uc-quickstart/utils/abac/finance/4.CreateFinanceABACPolicies.sql @@ -0,0 +1,157 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC # Finance ABAC Policies - Minimal 5 Scenarios +-- MAGIC +-- MAGIC Catalog-level ABAC policies for the minimal finance demo (5 
groups, 5 scenarios). +-- MAGIC +-- MAGIC ## Prerequisites +-- MAGIC - Unity Catalog enabled with ABAC +-- MAGIC - Tag policies created (Terraform or 2.CreateFinanceTagPolicies.py) +-- MAGIC - 5 groups created (Terraform: Junior_Analyst, Senior_Analyst, US_Region_Staff, EU_Region_Staff, Compliance_Officer) +-- MAGIC - Tables tagged (3.ApplyFinanceSetTags.sql) +-- MAGIC - ABAC functions deployed (0.1finance_abac_functions.sql) +-- MAGIC +-- MAGIC ## 5 Scenarios +-- MAGIC 1. PII masking (Customers) - Junior masked, Senior + Compliance unmasked +-- MAGIC 2. Fraud / card (CreditCards) - Junior last-4, Senior full card, Compliance full+CVV +-- MAGIC 3. Fraud / transactions (Transactions) - Junior rounded amount, Senior + Compliance full +-- MAGIC 4. US region - US_Region_Staff row filter +-- MAGIC 5. EU region - EU_Region_Staff row filter + +-- COMMAND ---------- + +USE CATALOG fincat; +SHOW FUNCTIONS IN fincat.finance LIKE 'mask*'; +SHOW FUNCTIONS IN fincat.finance LIKE 'filter*'; +SELECT "Ready to create catalog-level ABAC policies (5 scenarios)" as status; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ## POLICY 1: PII Masking (Customers) +-- MAGIC Junior_Analyst: mask_pii_partial on Limited_PII columns, mask_ssn on SSN. Senior_Analyst and Compliance_Officer: unmasked. 
+ +-- COMMAND ---------- + +CREATE OR REPLACE POLICY fincat_pii_junior_mask +ON CATALOG fincat +COMMENT 'PII: Mask names and email for junior analysts' +COLUMN MASK fincat.finance.mask_pii_partial +TO `Junior_Analyst` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_level', 'Limited_PII') AS pii_cols +ON COLUMN pii_cols; + +CREATE OR REPLACE POLICY fincat_pii_junior_ssn +ON CATALOG fincat +COMMENT 'PII: Mask SSN for junior analysts' +COLUMN MASK fincat.finance.mask_ssn +TO `Junior_Analyst` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_level', 'Full_PII') AND hasTagValue('data_residency', 'US') AS ssn_cols +ON COLUMN ssn_cols; + +SELECT "POLICY 1: PII masking policies created" as status; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ## POLICY 2: Fraud / Card (CreditCards) +-- MAGIC Junior_Analyst: last-4 only. Senior_Analyst: full card (CVV masked). Compliance_Officer: full card + CVV. + +-- COMMAND ---------- + +CREATE OR REPLACE POLICY fincat_pci_junior_last4 +ON CATALOG fincat +COMMENT 'Card: Last 4 digits only for junior analysts' +COLUMN MASK fincat.finance.mask_credit_card_last4 +TO `Junior_Analyst` +FOR TABLES +MATCH COLUMNS hasTagValue('pci_clearance', 'Full') AS card_cols +ON COLUMN card_cols; + +CREATE OR REPLACE POLICY fincat_pci_cvv_mask_except_compliance +ON CATALOG fincat +COMMENT 'Card: Mask CVV for all except Compliance_Officer' +COLUMN MASK fincat.finance.mask_credit_card_full +TO `account users` +EXCEPT `Compliance_Officer` +FOR TABLES +MATCH COLUMNS hasTagValue('pci_clearance', 'Administrative') AS cvv_cols +ON COLUMN cvv_cols; + +SELECT "POLICY 2: Fraud/card policies created" as status; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ## POLICY 3: Fraud / Transactions (Amount rounding) +-- MAGIC Junior_Analyst: rounded amounts. Senior_Analyst and Compliance_Officer: full. 
-- COMMAND ----------

-- POLICY 3: transaction-amount columns tagged aml_clearance = 'Junior_Analyst'
-- are rounded (mask_amount_rounded) for the Junior_Analyst group; other groups
-- are unaffected and see exact amounts.
CREATE OR REPLACE POLICY fincat_aml_junior_round
ON CATALOG fincat
COMMENT 'Transactions: Round amount for junior analysts'
COLUMN MASK fincat.finance.mask_amount_rounded
TO `Junior_Analyst`
FOR TABLES
MATCH COLUMNS hasTagValue('aml_clearance', 'Junior_Analyst') AS aml_cols
ON COLUMN aml_cols;

SELECT "POLICY 3: Fraud/transactions policy created" as status;

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ## POLICY 4: US Region (Row filter for US_Region_Staff)
-- MAGIC Tables tagged customer_region = 'Regional' get row filter for US staff.

-- COMMAND ----------

-- POLICY 4: row filter (not a column mask). On any table tagged
-- customer_region = 'Regional', members of US_Region_Staff only see rows
-- admitted by fincat.finance.filter_by_region_us. WHEN matches at table
-- level, unlike MATCH COLUMNS which binds to individual columns.
CREATE OR REPLACE POLICY fincat_region_us
ON CATALOG fincat
COMMENT 'Region: US staff see US customer data only'
ROW FILTER fincat.finance.filter_by_region_us
TO `US_Region_Staff`
FOR TABLES
WHEN hasTagValue('customer_region', 'Regional');

SELECT "POLICY 4: US region policy created" as status;

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ## POLICY 5: EU Region (Row filter for EU_Region_Staff)
-- MAGIC Tables tagged customer_region = 'Regional' get row filter for EU staff.
-- COMMAND ----------

-- POLICY 5: mirror of POLICY 4 for EU staff. Same table tag
-- (customer_region = 'Regional'), different principal and filter function,
-- so US and EU staff each see only their own region's rows.
CREATE OR REPLACE POLICY fincat_region_eu
ON CATALOG fincat
COMMENT 'Region: EU staff see EU customer data only'
ROW FILTER fincat.finance.filter_by_region_eu
TO `EU_Region_Staff`
FOR TABLES
WHEN hasTagValue('customer_region', 'Regional');

SELECT "POLICY 5: EU region policy created" as status;

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ## Verification

-- COMMAND ----------

-- List every ABAC policy attached at the catalog level, then print a
-- human-readable summary. Policy counts: 2 + 2 + 1 + 1 + 1 = 7 total.
SHOW POLICIES ON CATALOG fincat;

SELECT 'Policy Summary' as section, '5 scenarios' as status
UNION ALL SELECT 'Scenario 1', 'PII masking (2 policies)'
UNION ALL SELECT 'Scenario 2', 'Fraud/card (2 policies)'
UNION ALL SELECT 'Scenario 3', 'Fraud/transactions (1 policy)'
UNION ALL SELECT 'Scenario 4', 'US region (1 policy)'
UNION ALL SELECT 'Scenario 5', 'EU region (1 policy)';

SELECT "All 7 finance ABAC policies created (minimal demo)" as status;
SELECT "Next: 5.TestFinanceABACPolicies.sql" as next_step;
diff --git a/uc-quickstart/utils/abac/finance/5.TestFinanceABACPolicies.sql b/uc-quickstart/utils/abac/finance/5.TestFinanceABACPolicies.sql
new file mode 100644
index 00000000..369b458e
--- /dev/null
+++ b/uc-quickstart/utils/abac/finance/5.TestFinanceABACPolicies.sql
-- =============================================
-- FINANCE ABAC - TEST QUERIES (Minimal 5 Scenarios)
-- Run as different user groups to validate masking and row filters
-- Groups: Junior_Analyst, Senior_Analyst, US_Region_Staff, EU_Region_Staff, Compliance_Officer
-- NOTE: this script asserts nothing itself; the same queries are run under
-- each group identity and results are compared against the Expected comments.
-- =============================================

USE CATALOG fincat;
USE SCHEMA finance;

-- =============================================
-- TEST 1: PII MASKING (Customers)
-- Exercises policies fincat_pii_junior_mask / fincat_pii_junior_ssn.
-- Test as: Junior_Analyst (masked), Senior_Analyst (unmasked), Compliance_Officer (unmasked)
-- =============================================

SELECT '========================================' as divider;
SELECT 'TEST 1: PII Masking (Customers)' as test_name;
SELECT '========================================' as divider;

SELECT
    CustomerID,
    FirstName,
    LastName,
    Email,
    SSN,
    CustomerRegion
FROM Customers
LIMIT 5;

-- Expected: Junior_Analyst -> masked FirstName, LastName, Email, SSN (e.g. ***). Senior + Compliance -> full values.
SELECT 'Test 1 complete: Check PII masking for your role' as result;

-- =============================================
-- TEST 2: FRAUD / CARD (CreditCards)
-- Exercises policies fincat_pci_junior_last4 / fincat_pci_cvv_mask_except_compliance.
-- Test as: Junior_Analyst (last-4), Senior_Analyst (full card, CVV masked), Compliance_Officer (full + CVV)
-- =============================================

SELECT '========================================' as divider;
SELECT 'TEST 2: Fraud / Card (CreditCards)' as test_name;
SELECT '========================================' as divider;

SELECT
    CardID,
    CustomerID,
    CardNumber,
    CVV,
    CardType,
    ExpirationDate
FROM CreditCards
LIMIT 5;

-- Expected: Junior -> XXXX-XXXX-XXXX-1234, CVV masked. Senior -> full CardNumber, CVV masked. Compliance -> full CardNumber + CVV.
SELECT 'Test 2 complete: Check card masking for your role' as result;

-- =============================================
-- TEST 3: FRAUD / TRANSACTIONS (Amount)
-- Exercises policy fincat_aml_junior_round on the Amount column.
-- Test as: Junior_Analyst (rounded), Senior_Analyst (full), Compliance_Officer (full)
-- =============================================

SELECT '========================================' as divider;
SELECT 'TEST 3: Fraud / Transactions (Amount)' as test_name;
SELECT '========================================' as divider;

SELECT
    TransactionID,
    AccountID,
    TransactionDate,
    Amount,
    TransactionType,
    TransactionStatus
FROM Transactions
ORDER BY TransactionDate DESC
LIMIT 10;

-- Expected: Junior -> Amount rounded (e.g. 1200.00). Senior + Compliance -> exact Amount.
SELECT 'Test 3 complete: Check transaction amount for your role' as result;

-- =============================================
-- TEST 4: US REGION (Row filter)
-- Exercises policy fincat_region_us (filter_by_region_us).
-- Test as: US_Region_Staff (should see only CustomerRegion = 'US' rows)
-- =============================================

SELECT '========================================' as divider;
SELECT 'TEST 4: US Region (US_Region_Staff)' as test_name;
SELECT '========================================' as divider;

SELECT CustomerID, FirstName, LastName, CustomerRegion
FROM Customers
ORDER BY CustomerRegion;

-- Expected when run as US_Region_Staff: Only rows where CustomerRegion = 'US'. Other roles may see all regions.
SELECT 'Test 4 complete: US_Region_Staff should see only US rows' as result;

-- =============================================
-- TEST 5: EU REGION (Row filter)
-- Exercises policy fincat_region_eu (filter_by_region_eu); same query as
-- TEST 4 so the row-level difference between the two groups is obvious.
-- Test as: EU_Region_Staff (should see only CustomerRegion = 'EU' rows)
-- =============================================

SELECT '========================================' as divider;
SELECT 'TEST 5: EU Region (EU_Region_Staff)' as test_name;
SELECT '========================================' as divider;

SELECT CustomerID, FirstName, LastName, CustomerRegion
FROM Customers
ORDER BY CustomerRegion;

-- Expected when run as EU_Region_Staff: Only rows where CustomerRegion = 'EU'. Other roles may see all regions.
SELECT 'Test 5 complete: EU_Region_Staff should see only EU rows' as result;

-- =============================================
-- SUMMARY
-- =============================================

SELECT 'Minimal 5-scenario tests complete. Run as Junior_Analyst, Senior_Analyst, US_Region_Staff, EU_Region_Staff, Compliance_Officer to validate.' as summary;
diff --git a/uc-quickstart/utils/abac/finance/ABAC_FINANCE_Demo_Plan.md b/uc-quickstart/utils/abac/finance/ABAC_FINANCE_Demo_Plan.md
new file mode 100644
index 00000000..8624de12
--- /dev/null
+++ b/uc-quickstart/utils/abac/finance/ABAC_FINANCE_Demo_Plan.md


# šŸŽÆ ABAC in Financial Services: Field Tips and Demo Mastery

## šŸŽŖ The Art of Finance ABAC Demonstrations

**Theme**: "Real-World Field Tricks for Winning Financial Services ABAC Demonstrations"

**Mission**: Transform technical ABAC features into compelling business stories that resonate with financial services decision-makers through battle-tested demo techniques.

> **šŸ“ Note**: This guide focuses specifically on **Financial Services** use cases, covering banking, payments, trading, and compliance. This complements our healthcare ABAC framework and demonstrates the versatility of attribute-based access control across industries.

---

## Minimal 5-Group Demo (Quick Version)

For a **short demo**, use only **5 groups** and **5 scenarios**: (1) **PII masking** – run the same `SELECT` on Customers as Junior_Analyst (masked) vs Senior_Analyst or Compliance_Officer (unmasked). (2) **Fraud/card** – run the same `SELECT` on CreditCards as Junior (last-4 only), Senior (full card), Compliance (full + CVV). (3) **Fraud/transactions** – run the same `SELECT` on Transactions as Junior (rounded Amount) vs Senior/Compliance (full Amount). (4) **US region** – run `SELECT` on Customers as US_Region_Staff (only US rows). (5) **EU region** – run the same as EU_Region_Staff (only EU rows). **Compliance_Officer** sees everything (all regions, unmasked). Setup: Terraform in `genie/aws` creates the 5 groups and tag policies; then run the SQL notebooks in order (functions → schema → tags → ABAC policies). Test with `5.TestFinanceABACPolicies.sql`.
+ +--- + +## 🧠 The Psychology of Financial Services Demos + +### **The Financial Services Mindset** +- **Heavily Regulated**: "What regulators will audit matters more than what's convenient" +- **Risk-First**: "Show me what could go wrong before showing me what works" +- **Cost-Conscious**: "Compliance costs money, but non-compliance costs more" +- **Speed-Obsessed**: "Markets move in milliseconds, compliance can't slow us down" + +### **Demo Success Formula** +> **Trust + Proof + ROI = Decision** + +- **Trust**: Demonstrate you understand their regulatory burden +- **Proof**: Show real-world scenarios they face daily +- **ROI**: Quantify cost savings and risk reduction in dollars + +--- + +## šŸŽ­ Field Trick #1: The "3 AM Fraud Alert" Opening + +### **The Setup** (60 seconds) +Instead of starting with technology, start with their reality: + +> *"It's 3 AM Saturday. Your fraud detection system flags 10,000 accounts with suspicious charges. Your fraud analyst needs IMMEDIATE access to full card numbers to verify with card issuers and stop the bleeding. But your PCI-DSS compliance officer wakes up in a cold sweat - giving anyone access to full PANs violates your security policies. Meanwhile, customer service is getting hammered with calls, but they can only see the last 4 digits. This exact scenario cost Capital One $80 million in their 2019 breach. How do you balance security with operational urgency?"* + +### **The Payoff** +- **Immediate Connection**: They've lived this nightmare +- **Regulatory Hook**: PCI-DSS is their reality +- **Cost Urgency**: Real breach costs, not theoretical + +### **Field Trick**: Always start with a breach or audit failure story - financial services knows the cost of getting it wrong. + +--- + +## šŸŽ­ Field Trick #2: The "Same Query, Different Universe" Magic + +### **The Setup** +Show the exact same SQL query executed by three different roles, revealing completely different card data. 
+ +```sql +-- The "Magic Query" - identical for all users +SELECT + CardID, + CustomerID, + CardNumber, + CVV, + ExpirationDate, + CardType +FROM fincat.finance.CreditCards +LIMIT 3; +``` + +### **The Reveal** (This is where jaws drop) + +**Customer Service Rep sees:** +``` +CARD0001 | REF_c8a9f... | XXXX-XXXX-XXXX-9010 | XXXX-XXXX-XXXX-XXXX | 12/2026 | Visa +CARD0002 | REF_2b771... | XXXX-XXXX-XXXX-0123 | XXXX-XXXX-XXXX-XXXX | 06/2025 | Mastercard +CARD0003 | REF_40c7a... | XXXX-XXXX-XXXX-1234 | XXXX-XXXX-XXXX-XXXX | 09/2027 | Amex +``` + +**Fraud Analyst sees:** +``` +CARD0001 | CUST00001 | 4532-1234-5678-9010 | XXX | 12/2026 | Visa +CARD0002 | CUST00002 | 5425-2345-6789-0123 | XXX | 06/2025 | Mastercard +CARD0003 | CUST00003 | 3782-456789-01234 | XXX | 09/2027 | Amex +``` + +**Compliance Officer sees:** +``` +CARD0001 | CUST00001 | 4532-1234-5678-9010 | 123 | 12/2026 | Visa +CARD0002 | CUST00002 | 5425-2345-6789-0123 | 456 | 06/2025 | Mastercard +CARD0003 | CUST00003 | 3782-456789-01234 | 789 | 09/2027 | Amex +``` + +### **The Magic Moment** +> *"Same query. Same database. Same moment in time. But three completely different views of reality based on PCI-DSS clearance levels. Customer service can verify the last 4 digits with customers. Fraud analysts can call card issuers with full PANs. Compliance can audit everything including CVVs. This is ABAC preventing your next breach while enabling your operations."* + +### **Field Trick**: Practice this reveal timing - the pause after running the query builds suspense. This is your showstopper moment. + +--- + +## šŸŽ­ Field Trick #3: The "Chinese Wall Proof" + +### **The Setup** +Show how research analysts are completely blocked from seeing trading positions - the digital Chinese wall. + +### **The Script** +> *"Investment banks live in fear of the SEC. One research analyst seeing insider trading data? That's a $50 million fine and front-page scandal. 
Watch what happens when our research analyst tries to view trading positions..."* + +```sql +-- Research analyst attempting to view trading positions +SELECT + PositionID, + SecurityName, + TradingDesk, + PnL +FROM fincat.finance.TradingPositions +ORDER BY PnL DESC +LIMIT 10; +``` + +### **The Magic Result** +```sql +-- Research analyst sees: +(0 rows returned) + +-- Meanwhile, Equity trader sees: +POS00001 | Apple Inc | Equity | $25,250.00 +POS00002 | Alphabet Inc | Equity | $75,375.00 +... + +-- Risk manager (neutral) sees: +(All positions across all desks) +``` + +### **The Revelation** +> *"Notice what didn't happen - no error message, no 'Access Denied' popup. The trading data simply doesn't exist in the research analyst's universe. They can't accidentally stumble on it, can't screenshot it, can't leak it. The Chinese wall is enforced at the data layer, not by trust or policy documents. This is how you sleep at night while the SEC watches."* + +### **Field Trick**: Show the "0 rows returned" - the invisible protection is more powerful than error messages. + +--- + +## šŸŽ­ Field Trick #4: The "AML Investigation Escalation" + +### **The Setup** +Show progressive access to transaction data as AML investigations escalate. + +### **The Script** +> *"Your AML team monitors thousands of transactions daily. Junior analysts look for patterns. But you can't give them full customer PII - GDPR violations. 
Watch how access expands as an investigation escalates..."* + +**Junior Analyst Query:** +```sql +SELECT + TransactionID, + Amount, + TransactionType, + CountryCode +FROM fincat.finance.Transactions +WHERE Amount > 10000 +LIMIT 10; +``` + +**Junior sees:** +``` +TXN000003 | 15,000.00 | Withdrawal | US -- Amount rounded +TXN000004 | 8,500.00 | Transfer | DE -- Aggregated +TXN000006 | 45,000.00 | Deposit | BR -- Pattern visible but no PII +``` + +**Senior Investigator sees:** +``` +TXN000003 | 15,234.50 | Withdrawal | US | Cash Withdrawal ATM -- Full details +TXN000004 | 8,567.20 | Transfer | DE | International Wire -- Customer linkable +TXN000006 | 45,123.89 | Deposit | BR | Large Cash Deposit -- Investigation notes visible +``` + +### **The Business Impact** +> *"Junior analysts can spot patterns across thousands of transactions without accessing PII - GDPR compliant. When they escalate a case, senior investigators automatically get the customer details needed for FinCEN reports. Your AML team moves faster while your privacy team sleeps better."* + +### **Field Trick**: Show the progression - same data, different detail levels. This demonstrates "need to know" in action. + +--- + +## šŸŽ­ Field Trick #5: The "GDPR Geographic Lockdown" + +### **The Setup** +Show how EU customer data stays in the EU, blocking access from US staff. + +### **The Script** +> *"Your bank operates in 47 countries. GDPR says EU customer data can't leave the EU without explicit consent. CCPA has different rules for California. PDPA covers Singapore. How do you enforce all this without building 47 separate databases? 
Watch..."* + +```sql +-- US Staff trying to view all customers +SELECT + CustomerID, + FirstName, + LastName, + CustomerRegion +FROM fincat.finance.Customers +ORDER BY CustomerRegion; +``` + +**US Staff sees:** +``` +CUST00001 | John | Smith | US +CUST00002 | Maria | Garcia | US +CUST00006 | Sarah | Johnson | US +-- Only US customers visible, EU/APAC completely invisible +``` + +**EU Staff sees:** +``` +CUST00003 | Hans | Mueller | EU +CUST00004 | Sophie | Dubois | EU +CUST00009 | Emma | Wilson | EU +-- Only EU customers visible +``` + +**Compliance Officer (Global) sees:** +``` +(All customers from all regions - global oversight) +``` + +### **The Revelation** +> *"Your US staff literally cannot see EU customer data. Not 'shouldn't' - CANNOT. If they run a query, EU records don't appear. If they try to join tables, EU transactions are filtered out. The data residency rules are enforced at the database level, not by training or policy. This is GDPR by design, not by compliance memo."* + +### **Field Trick**: Show the row count by region - US staff see 3 customers, EU staff see 3 different customers, compliance sees all 10. + +--- + +## šŸŽ­ Field Trick #6: The "Trading Hours Lockout" + +### **The Setup** +Show how risk managers are blocked from viewing positions during market hours to prevent manipulation. + +### **The Script** +> *"Your risk manager needs to monitor trader P&L. But if they can see live positions during trading hours, they might interfere - 'Close that losing position now!' That's market manipulation. SEC fines start at $1 million. 
Watch what happens when risk tries to view positions at..."* + +```sql +-- Check current market status +SELECT + CURRENT_TIMESTAMP() as now, + CASE + WHEN HOUR(CURRENT_TIMESTAMP()) BETWEEN 14 AND 20 + THEN 'TRADING HOURS (9:30 AM - 4:00 PM ET)' + ELSE 'AFTER HOURS' + END as market_status; +``` + +**During Trading Hours (2:30 PM ET / 19:30 UTC):** +``` +2026-01-26T19:30:00 | TRADING HOURS (9:30 AM - 4:00 PM ET) + +-- Risk manager queries positions: +SELECT * FROM fincat.finance.TradingPositions; + +Result: 0 rows returned (blocked during trading) +``` + +**After Hours (6:00 PM ET / 23:00 UTC):** +``` +2026-01-26T23:00:00 | AFTER HOURS + +-- Same query now returns data: +POS00001 | AAPL | Equity | $25,250.00 | ... +POS00002 | GOOGL | Equity | $75,375.00 | ... +(Full access to all positions and P&L) +``` + +### **The Magic** +> *"At 4:00 PM when markets close, risk managers automatically gain access. At 9:30 AM when markets open, access disappears. No manual enabling, no forgotten permissions. The system knows what time it is and enforces clean separation. Your traders trade free from interference, your risk team reviews everything after hours."* + +### **Field Trick**: If possible, demonstrate this live by showing the actual time. If demo is after hours, show the code logic and explain the behavior. + +--- + +## šŸŽ­ Field Trick #7: The "Temporary Auditor Expiration" + +### **The Setup** +Show how external auditors get automatic time-limited access that expires without IT intervention. + +### **The Script** +> *"It's SOX audit season. External auditors need access to financial records for Q1 review. Your IT team creates accounts, grants permissions... then forgets to revoke them six months later. That's how auditors become permanent backdoors. 
Watch this instead..."* + +```sql +-- External auditor queries accounts +SELECT + AccountID, + Balance, + AccountType, + 'Q1 SOX Audit' as audit_scope, + '2026-03-31' as access_expires +FROM fincat.finance.Accounts +WHERE AccountID IN (SELECT AccountID FROM fincat.finance.AuditLogs WHERE AuditProject = 'Q1_SOX_Audit') +LIMIT 5; +``` + +**Before March 31, 2026:** +``` +ACC1001 | $15,234.50 | Checking | Q1 SOX Audit | 2026-03-31 +ACC1002 | $45,678.90 | Savings | Q1 SOX Audit | 2026-03-31 +(Full access to in-scope accounts) +``` + +**On April 1, 2026:** +``` +(0 rows returned - access automatically expired) +``` + +### **The Business Impact** +> *"On March 31st at midnight, the auditor's access disappears. No IT ticket, no manual revocation, no forgotten credentials. The ABAC policy checks the expiration date on every query. Your SOX audit happened, they got their data, and their access self-destructed. Your attack surface just shrunk automatically."* + +### **Field Trick**: Show the expiration date in the data itself - makes it tangible and visible. + +--- + +## šŸŽ­ Field Trick #8: The "Referential Integrity Magic" + +### **The Setup** +Show how deterministic masking preserves JOIN capabilities for analytics while protecting PII. + +### **The Script** +> *"Your marketing team needs to analyze customer transaction patterns. But GDPR says they can't see real names or IDs. Most masking breaks database joins - random tokens don't match across tables. 
Watch our deterministic masking..."* + +```sql +-- Marketing analyst performing cross-table analytics +SELECT + c.CustomerID, -- Masked deterministically + c.FirstName, -- Masked as J*** + COUNT(t.TransactionID) as transaction_count, + AVG(t.Amount) as avg_transaction, + COUNT(DISTINCT a.AccountID) as account_count +FROM fincat.finance.Customers c +JOIN fincat.finance.Accounts a ON c.CustomerID = a.CustomerID +JOIN fincat.finance.Transactions t ON a.AccountID = t.AccountID +GROUP BY c.CustomerID, c.FirstName +ORDER BY transaction_count DESC +LIMIT 5; +``` + +**Marketing sees:** +``` +REF_c8a9f2... | J*** | 23 | $1,200.00 | 2 +REF_2b771f... | M*** | 18 | $850.00 | 1 +REF_40c7ac... | S*** | 31 | $2,100.00 | 3 +``` + +### **The Revelation** +> *"Notice what just happened - we joined across THREE tables using masked customer IDs, and every relationship remained intact. The same `REF_c8a9f2...` appears consistently wherever that customer's data exists. Marketing can build customer segments, identify high-value customers, and train machine learning models - all on protected data. The analytics work, the JOINs work, but the PII is protected. This is GDPR-compliant analytics that actually works."* + +### **The Business Impact** +- **Analytics Enabled**: Marketing can do real analysis without PII exposure +- **ML Training**: Models train on real relationship patterns with protected identities +- **Cost Savings**: No need for expensive synthetic data or separate analytics environments + +### **Field Trick**: Show the deterministic token in multiple query results - prove it's the same token for the same customer. + +--- + +## šŸŽ­ Field Trick #9: The "Before You Leave" Close + +### **The Urgency Builder** +> *"Before you leave this room, I want you to imagine three scenarios:"* + +1. **"It's next month, and your PCI-DSS audit takes 2 days instead of 2 weeks because every access is automatically logged and policy-enforced. How much did you just save?"** + +2. 
**"It's next quarter, and the SEC asks about your Chinese wall controls. You show them the ABAC policies that physically prevent research from seeing trading data. They nod and leave. How does that feel?"** + +3. **"It's next year, and you've had zero GDPR violations, zero data residency breaches, zero audit findings. Your compliance team has time to focus on strategy instead of firefighting. What's that worth?"** + +### **The Action Trigger** +> *"The question isn't whether you need better financial data governance. The question is: how much is your next breach, your next audit failure, your next regulatory fine going to cost? Because with ABAC, those risks just became preventable."* + +### **Field Trick**: End with emotion, not technology. Paint the picture of their better future - compliant, secure, and profitable. + +--- + +## šŸ› ļø Demo Environment Setup Tricks + +### **Pre-Demo Checklist** +- [ ] **Backup Screenshots**: For every scenario, have screenshots ready in case live demo fails +- [ ] **Multiple User Sessions**: Pre-login different roles in separate browser tabs or terminal sessions +- [ ] **Query Shortcuts**: Save common queries as snippets for quick execution +- [ ] **Time Zone Awareness**: Adjust market hours demo based on actual current time +- [ ] **Network Backup**: Have mobile hotspot ready for connectivity issues +- [ ] **Data Refresh**: Ensure sample data is recent and realistic + +### **The "Demo Gods" Insurance Policy** +- Always test queries 30 minutes before the demo +- Have a colleague run through the full sequence +- Prepare 3 backup ways to show each key concept +- Know your audience's timezone for time-based demos +- Have a "demo reset" script to restore state + +--- + +## šŸŽÆ Audience-Specific Adaptations + +### **For CISOs** (Security-First) +- Lead with breach prevention ($80M Capital One example) +- Show PCI-DSS and SEC compliance automation +- Emphasize Chinese wall enforcement for insider trading prevention +- Focus on audit 
trails and incident response + +### **For CFOs** (Cost-First) +- Show audit cost reduction (2 weeks → 2 days = $150K saved) +- Demonstrate regulatory fine avoidance ($50M SEC Chinese wall violations) +- Highlight PCI-DSS compliance cost savings ($500K/year) +- Prove ROI with hard numbers + +### **For CROs** (Risk-First) +- Show risk reduction metrics (zero GDPR violations) +- Demonstrate AML investigation efficiency (50% faster) +- Highlight breach prevention (Capital One-scale events) +- Focus on regulatory compliance automation + +### **For CTOs** (Architecture-First) +- Show scalability across 47 countries with one catalog +- Demonstrate performance with no query overhead +- Highlight API integration capabilities +- Focus on unified policy management + +### **For Compliance Officers** (Regulation-First) +- Lead with regulatory requirement coverage (PCI-DSS, GDPR, SOX, SEC, MiFID II) +- Show automated audit trail generation +- Demonstrate policy version control and documentation +- Focus on multi-jurisdiction compliance (EU, US, APAC) + +### **For Data Scientists** (Analytics-First) +- Show how ABAC enables rather than blocks analytics +- Demonstrate cross-table JOINs with deterministic masking +- Highlight privacy-preserving ML training +- Focus on marketing and customer analytics capabilities + +--- + +## šŸ† The Demo Success Formula + +### **Opening** (2 minutes) +1. **Pain Recognition**: "3 AM fraud alert - you've lived this" +2. **Regulatory Reality**: "PCI-DSS isn't optional" +3. **Cost Quantification**: "$80 million breach - it's happened" +4. **Solution Promise**: "Let me show you prevention" + +### **Demonstration** (15 minutes) +1. **PCI-DSS Card Masking**: Same query, different card data (2 min) +2. **Chinese Wall Proof**: Research blocked from trading (2 min) +3. **AML Escalation**: Progressive investigation access (2 min) +4. **GDPR Geographic Lockdown**: EU data stays in EU (2 min) +5. 
**Trading Hours Restriction**: Time-based P&L access (2 min) +6. **Temporary Auditor Expiry**: Self-destructing access (2 min) +7. **Referential Integrity**: Cross-table analytics work (3 min) + +### **Close** (5 minutes) +1. **Summarize Value**: "Seven scenarios, billions in risk reduction" +2. **Quantify ROI**: "$4.2M in cost avoidance, first year" +3. **Address Concerns**: "Implementation is 2-3 weeks, not months" +4. **Create Urgency**: "Your next audit is when?" +5. **Define Next Steps**: "Proof of concept starts tomorrow" + +--- + +## šŸŽŖ The Master Demo Sequence + +### **The 7-Act Financial Services Play** +1. **Act 1**: The Breach (PCI-DSS card data leak scenario) +2. **Act 2**: The Magic Query (Same SQL, different card masking) +3. **Act 3**: The Chinese Wall (Research blocked from trading) +4. **Act 4**: The AML Escalation (Progressive investigation access) +5. **Act 5**: The Geographic Lockdown (GDPR enforcement in action) +6. **Act 6**: The Audit Trail (Compliance officer's view) +7. 
**Act 7**: The ROI Revelation (Cost savings and risk reduction) + +### **Timing is Everything** +- **5 seconds**: Time to capture attention with 3 AM fraud story +- **30 seconds**: Maximum time for any single query to execute +- **2 minutes**: Maximum time on any single scenario +- **15 seconds**: Pause time after major reveals for impact +- **5 minutes**: Buffer time for questions and discussion + +--- + +## šŸ’° ROI Metrics Library + +Connect every demo moment to dollars: + +- **PCI-DSS Compliance**: $500K/year audit cost reduction (automated controls) +- **Breach Prevention**: $80M Capital One-scale breach avoided +- **Chinese Wall Violations**: $50M SEC fine prevention +- **AML Investigation**: 50% faster investigations = $200K/year savings +- **GDPR Compliance**: €20M fine avoidance (4% revenue penalty) +- **Audit Cost Reduction**: $150K/year (2 weeks → 2 days) +- **Temporary Access Management**: $80K/year (40 hours/month saved) +- **Cross-Border Compliance**: $500K/year multi-jurisdiction management + +**Cumulative Impact**: > *"We just demonstrated $82M in breach prevention and $1.4M in annual compliance cost savings. Your ABAC investment pays for itself in 60 days."* + +--- + +## šŸŽ­ Final Field Wisdom + +### **The Golden Rules** +1. **Always tell a story, never just show features** - Start with the 3 AM fraud alert +2. **Make it about their risk, not your technology** - Regulators audit them, not you +3. **Show don't tell - then tell what you showed** - Query results speak louder than slides +4. **Practice the pause** - Silence after "0 rows returned" builds impact +5. **Connect every feature to a dollar sign or fine** - $80M breach or €20M GDPR penalty +6. **Prepare for failure** - Have screenshots ready, demo gods are fickle +7. **End with emotion and urgency** - "Your next audit is when?" + +### **The Demo Ninja Mindset** +> *"I'm not here to show you software. 
I'm here to show you a future where PCI-DSS audits take days not weeks, where SEC Chinese wall inquiries get answered with data not promises, where GDPR compliance is automatic not aspirational. Where your compliance team becomes strategic advisors instead of policy police. Where your next breach doesn't happen because the data simply isn't accessible to those who shouldn't see it."* + +--- + +**šŸŽÆ Remember: Great demos don't sell software - they sell freedom from fear of the next regulatory fine.** + +--- + +## šŸŒ Industry Variations + +While this guide focuses on financial services, ABAC patterns apply across industries: + +### šŸ„ **Healthcare** (See healthcare demo guide) +- HIPAA instead of PCI-DSS +- PHI instead of card data +- Doctor/nurse roles instead of traders/analysts + +### šŸ­ **Manufacturing** +- IP protection instead of customer PII +- Supplier data segregation instead of geographic residency +- Patent/design access controls instead of trading positions + +### šŸ›’ **Retail** +- Customer purchase history instead of transactions +- PCI-DSS for e-commerce payments +- Marketing segmentation with GDPR compliance + +### šŸš€ **Want to Contribute?** + +We're always looking for real-world ABAC use cases from financial services. If you have: +- **Industry-specific compliance scenarios** that would make compelling demos +- **Real customer pain points** from banking, payments, trading, or insurance +- **Field experience** with ABAC implementations in financial services +- **Demo techniques** that have won deals + +**Reach out to us!** We'd love to expand this collection. 
+ +--- + +*Now go forth and demo with confidence - and may the compliance gods smile upon you!* šŸ¦šŸ”šŸ’° diff --git a/uc-quickstart/utils/abac/finance/ABAC_Performance_Finance.md b/uc-quickstart/utils/abac/finance/ABAC_Performance_Finance.md new file mode 100644 index 00000000..bcbb121a --- /dev/null +++ b/uc-quickstart/utils/abac/finance/ABAC_Performance_Finance.md @@ -0,0 +1,670 @@ +# āš ļø ABAC Performance Anti-Patterns: Finance Domain + +## šŸŽÆ Critical Performance Guidelines for Financial Services ABAC + +### 🚨 The Financial Services Performance Reality + +**Financial services operates at millisecond scale.** Trading systems process thousands of transactions per second. A slow ABAC policy can cost millions in lost trading opportunities or cause audit queries to timeout. Poor function design turns compliance from enabler to bottleneck. + +> **Key Principle**: ABAC policies run on EVERY query execution. In high-frequency trading environments, even 1ms of overhead multiplied by millions of queries becomes unacceptable. 
+ +--- + +## šŸ”“ FINANCE-SPECIFIC ANTI-PATTERNS + +### āŒ Anti-Pattern #1: Real-Time Trading Position Calculations + +**What NOT to Do:** +```sql +-- NEVER DO THIS - Complex P&L calculation in mask function +CREATE OR REPLACE FUNCTION mask_pnl_with_realtime_calc( + position_id STRING, + entry_price DECIMAL, + quantity INT +) +RETURNS DECIMAL +DETERMINISTIC +RETURN + CASE + WHEN user_role = 'TRADER' THEN + (SELECT current_price FROM market_data.live_prices WHERE symbol = + (SELECT security_id FROM positions WHERE position_id = position_id) + ) * quantity - (entry_price * quantity) + ELSE NULL + END; +``` + +**Why This Destroys Performance:** +- External market data lookup for every row +- Nested subquery for security lookup +- Calculations repeated for every masked value +- No caching possible +- Blocks query optimization + +**Performance Impact:** šŸ”„ **10,000x+ slower** (External data fetch per row) + +**Correct Approach:** +```sql +-- āœ… GOOD - Mask the stored P&L value, don't recalculate it +CREATE OR REPLACE FUNCTION mask_pnl_stored(pnl_value DECIMAL) +RETURNS DECIMAL +DETERMINISTIC +RETURN + CASE + WHEN pnl_value IS NULL THEN NULL + ELSE ROUND(pnl_value, -2) -- Round to nearest 100 for restricted roles + END; +``` + +--- + +### āŒ Anti-Pattern #2: Customer Credit Score Lookups + +**What NOT to Do:** +```sql +-- NEVER DO THIS - Credit bureau lookup in filter function +CREATE OR REPLACE FUNCTION filter_by_creditworthiness(customer_id STRING) +RETURNS BOOLEAN +DETERMINISTIC +RETURN + ( + SELECT credit_score + FROM external_credit_bureau.scores + WHERE customer_id = customer_id + AND score_date = CURRENT_DATE() + ) >= 650; +``` + +**Why This Kills Performance:** +- External credit bureau API call per row +- Network latency multiplied by row count +- Expensive third-party API costs +- Single point of failure +- No result caching + +**Performance Impact:** šŸ”„ **100,000x slower** + **$$$$ API costs** + +**Correct Approach:** +```sql +-- āœ… GOOD - Filter based 
on stored risk score column +CREATE OR REPLACE FUNCTION filter_by_risk_score( + customer_risk_score INT, + required_score INT +) +RETURNS BOOLEAN +DETERMINISTIC +RETURN customer_risk_score >= required_score; + +-- Pre-compute and store credit scores in your database +-- Use batch ETL to refresh from credit bureau daily, not per-query +``` + +--- + +### āŒ Anti-Pattern #3: AML Transaction Pattern Analysis + +**What NOT to Do:** +```sql +-- NEVER DO THIS - Complex AML pattern detection in row filter +CREATE OR REPLACE FUNCTION filter_suspicious_transactions( + customer_id STRING, + transaction_id STRING, + amount DECIMAL +) +RETURNS BOOLEAN +DETERMINISTIC +RETURN + CASE + -- Check for structuring (multiple transactions under $10k) + WHEN ( + SELECT COUNT(*) + FROM transactions + WHERE customer_id = customer_id + AND transaction_date = CURRENT_DATE() + AND amount < 10000 + ) > 3 THEN FALSE + -- Check for rapid movement across borders + WHEN ( + SELECT COUNT(DISTINCT country_code) + FROM transactions + WHERE customer_id = customer_id + AND transaction_date >= CURRENT_DATE() - INTERVAL 7 DAYS + ) > 5 THEN FALSE + ELSE TRUE + END; +``` + +**Why This Breaks Everything:** +- Multiple complex subqueries per row +- Correlated subqueries prevent parallelization +- Date range queries per transaction +- Cartesian product explosion risk +- Impossible to optimize + +**Performance Impact:** šŸ”„ **50,000x slower** (Multiple subqueries per row) + +**Correct Approach:** +```sql +-- āœ… GOOD - Filter based on pre-computed AML flag column +CREATE OR REPLACE FUNCTION filter_by_aml_flag( + aml_flag_level STRING, + user_clearance STRING +) +RETURNS BOOLEAN +DETERMINISTIC +RETURN + CASE + WHEN aml_flag_level = 'NONE' THEN TRUE + WHEN aml_flag_level = 'LOW' AND user_clearance IN ('ANALYST', 'SENIOR', 'OFFICER') THEN TRUE + WHEN aml_flag_level = 'HIGH' AND user_clearance IN ('SENIOR', 'OFFICER') THEN TRUE + ELSE FALSE + END; + +-- Run AML pattern detection as separate batch job +-- Store 
results in aml_flag_level column +-- ABAC policies filter based on stored flags, not live analysis +``` + +--- + +### āŒ Anti-Pattern #4: Card BIN Lookup for Issuer Information + +**What NOT to Do:** +```sql +-- NEVER DO THIS - BIN database lookup in mask function +CREATE OR REPLACE FUNCTION mask_card_with_issuer(card_number STRING) +RETURNS STRING +DETERMINISTIC +RETURN + CASE + WHEN ( + SELECT issuer_name + FROM card_bin_database.issuers + WHERE bin = SUBSTRING(card_number, 1, 6) + ) IN ('Visa', 'Mastercard') THEN CONCAT('XXXX-XXXX-XXXX-', RIGHT(card_number, 4)) + ELSE 'XXXX-XXXX-XXXX-XXXX' + END; +``` + +**Why This Kills Performance:** +- BIN lookup for every card number +- Database JOIN per row +- External table dependency +- Prevents column pruning + +**Performance Impact:** šŸ”„ **1,000x slower** (Lookup per masked value) + +**Correct Approach:** +```sql +-- āœ… GOOD - Mask based on stored card type column +CREATE OR REPLACE FUNCTION mask_card_by_type( + card_number STRING, + card_type STRING, + user_clearance STRING +) +RETURNS STRING +DETERMINISTIC +RETURN + CASE + WHEN user_clearance = 'FULL' THEN card_number + WHEN user_clearance = 'BASIC' AND card_type IS NOT NULL + THEN CONCAT('XXXX-XXXX-XXXX-', RIGHT(card_number, 4)) + ELSE 'XXXX-XXXX-XXXX-XXXX' + END; + +-- Store card_type when card is added to database +-- No runtime lookups needed +``` + +--- + +### āŒ Anti-Pattern #5: Exchange Rate Conversions in Amount Masking + +**What NOT to Do:** +```sql +-- NEVER DO THIS - Currency conversion in mask function +CREATE OR REPLACE FUNCTION mask_amount_usd_converted( + amount DECIMAL, + currency STRING +) +RETURNS DECIMAL +DETERMINISTIC +RETURN + CASE + WHEN currency = 'USD' THEN ROUND(amount, -2) + ELSE ROUND( + amount * ( + SELECT rate + FROM forex.exchange_rates + WHERE from_currency = currency + AND to_currency = 'USD' + AND rate_date = CURRENT_DATE() + ), + -2 + ) + END; +``` + +**Why This Destroys Performance:** +- Forex rate lookup per transaction +- 
Date-based queries per row +- External table dependency +- No rate caching + +**Performance Impact:** šŸ”„ **5,000x slower** (Forex lookup per amount) + +**Correct Approach:** +```sql +-- āœ… GOOD - Mask the stored amount in original currency +CREATE OR REPLACE FUNCTION mask_amount_rounded( + amount DECIMAL, + sensitivity_level STRING +) +RETURNS DECIMAL +DETERMINISTIC +RETURN + CASE + WHEN sensitivity_level = 'PUBLIC' THEN amount + WHEN amount < 100 THEN ROUND(amount, -1) + ELSE ROUND(amount, -2) + END; + +-- Pre-convert amounts to USD in ETL if needed +-- Store both original and USD amounts as columns +-- Mask the stored values, don't convert at query time +``` + +--- + +### āŒ Anti-Pattern #6: Account Balance Aggregation in Filter + +**What NOT to Do:** +```sql +-- NEVER DO THIS - Account rollup in row filter +CREATE OR REPLACE FUNCTION filter_high_value_customers(customer_id STRING) +RETURNS BOOLEAN +DETERMINISTIC +RETURN + ( + SELECT SUM(balance) + FROM accounts + WHERE customer_id = customer_id + AND account_status = 'Active' + ) >= 100000; +``` + +**Why This Kills Performance:** +- Aggregation query per customer row +- Cross-table dependency +- Prevents parallel processing +- No optimization possible + +**Performance Impact:** šŸ”„ **10,000x slower** (Aggregation per row) + +**Correct Approach:** +```sql +-- āœ… GOOD - Filter based on pre-computed customer tier +CREATE OR REPLACE FUNCTION filter_by_customer_tier( + customer_tier STRING, + required_tier STRING +) +RETURNS BOOLEAN +DETERMINISTIC +RETURN + CASE + WHEN customer_tier = 'PLATINUM' THEN TRUE + WHEN customer_tier = 'GOLD' AND required_tier IN ('GOLD', 'SILVER', 'BRONZE') THEN TRUE + WHEN customer_tier = 'SILVER' AND required_tier IN ('SILVER', 'BRONZE') THEN TRUE + WHEN customer_tier = 'BRONZE' AND required_tier = 'BRONZE' THEN TRUE + ELSE FALSE + END; + +-- Compute customer tiers in batch ETL +-- Store as customer_tier column +-- Update nightly or as accounts change +``` + +--- + +## āœ… 
FINANCE-OPTIMIZED PATTERNS + +### šŸš€ High-Performance Trading Position Filter + +```sql +-- āœ… EXCELLENT - Pure column-based trading desk filtering +CREATE OR REPLACE FUNCTION filter_trading_desk_access( + position_desk STRING, + information_barrier STRING, + user_desk STRING, + user_barrier STRING +) +RETURNS BOOLEAN +DETERMINISTIC +RETURN + CASE + -- Neutral roles (Risk, Compliance) see everything + WHEN user_barrier = 'Neutral' THEN TRUE + + -- Same desk access + WHEN position_desk = user_desk AND information_barrier = user_barrier THEN TRUE + + -- Block cross-barrier access (Chinese wall) + WHEN information_barrier != user_barrier THEN FALSE + + ELSE FALSE + END; +``` + +**Why This Works:** +- Pure column comparisons - no lookups +- No external dependencies +- Fully optimizable by Spark +- Vectorizes efficiently +- Enables predicate pushdown + +**Performance Impact:** āœ… **Native speed** (< 1ms overhead) + +--- + +### šŸš€ High-Performance PCI-DSS Card Masking + +```sql +-- āœ… EXCELLENT - Simple string operations for card masking +CREATE OR REPLACE FUNCTION mask_card_pci( + card_number STRING, + pci_clearance STRING +) +RETURNS STRING +DETERMINISTIC +RETURN + CASE + WHEN pci_clearance = 'FULL' THEN card_number + WHEN pci_clearance = 'BASIC' THEN + CONCAT('XXXX-XXXX-XXXX-', RIGHT(REGEXP_REPLACE(card_number, '[^0-9]', ''), 4)) + WHEN pci_clearance = 'NONE' THEN 'XXXX-XXXX-XXXX-XXXX' + ELSE 'XXXX-XXXX-XXXX-XXXX' + END; +``` + +**Why This Works:** +- Built-in string functions only +- No external calls or lookups +- Deterministic and cacheable +- Simple CASE logic +- Minimal CPU overhead + +**Performance Impact:** āœ… **Near-native** (< 0.1ms per value) + +--- + +### šŸš€ High-Performance Geographic Residency Filter + +```sql +-- āœ… EXCELLENT - Simple region matching +CREATE OR REPLACE FUNCTION filter_data_residency( + customer_region STRING, + data_residency STRING, + user_region STRING +) +RETURNS BOOLEAN +DETERMINISTIC +RETURN + CASE + -- Global access + 
WHEN user_region = 'Global' THEN TRUE + + -- Exact region match + WHEN customer_region = user_region THEN TRUE + + -- Public data accessible by all + WHEN data_residency = 'Public' THEN TRUE + + ELSE FALSE + END; +``` + +**Why This Works:** +- Simple string equality checks +- No subqueries or joins +- Spark can optimize predicate +- Enables partition pruning +- Vectorizes perfectly + +**Performance Impact:** āœ… **Negligible overhead** (< 0.01ms) + +--- + +### šŸš€ High-Performance AML Clearance Filter + +```sql +-- āœ… EXCELLENT - Integer comparison for clearance levels +CREATE OR REPLACE FUNCTION filter_aml_access( + data_sensitivity INT, + user_clearance INT, + aml_flag STRING +) +RETURNS BOOLEAN +DETERMINISTIC +RETURN + CASE + -- High clearance sees everything + WHEN user_clearance >= 5 THEN TRUE + + -- Block flagged data from low clearance (must precede the sensitivity match, + -- otherwise low-sensitivity suspicious rows leak to clearance < 3 users) + WHEN aml_flag = 'SUSPICIOUS' AND user_clearance < 3 THEN FALSE + + -- Match clearance to sensitivity + WHEN user_clearance >= data_sensitivity THEN TRUE + + ELSE FALSE + END; +``` + +**Why This Works:** +- Integer comparisons (fastest operations) +- No string parsing or lookups +- Logical operators only +- Fully indexable +- Branch prediction friendly + +**Performance Impact:** āœ… **Optimal** (< 0.001ms) + +--- + +## šŸ“Š Finance Performance Benchmarks + +| Pattern Type | Query Time | Scalability | Trading Systems Compatible | +|-------------|------------|-------------|---------------------------| +| āŒ Real-time position calc | 10+ seconds | Breaks | No - unacceptable | +| āŒ Credit score lookup | 5-10 seconds | Poor | No - too slow | +| āŒ AML pattern analysis | 1-5 seconds | Poor | No - timeouts | +| āŒ BIN database lookup | 500ms-2s | Limited | No - high latency | +| āŒ Currency conversion | 500ms-1s | Poor | No - variable latency | +| āœ… Simple column logic | 1-10ms | Excellent | Yes - acceptable | +| āœ… Integer comparisons | < 1ms | Excellent | Yes - optimal | + +**Trading System Requirement**: < 10ms query 
overhead for position queries +**Compliance Reporting**: < 100ms acceptable for audit queries +**Real-time Fraud**: < 50ms for card authorization queries + +--- + +## šŸŽÆ Finance ABAC Golden Rules + +### **The 8 Commandments for Financial Services** + +1. **Pre-Compute, Don't Calculate**: AML flags, risk scores, customer tiers - compute once, filter many +2. **Store, Don't Lookup**: Card types, account balances, position P&L - store in columns +3. **Filter Columns, Don't Join Tables**: Use column values, not subqueries +4. **Simple Logic, Fast Execution**: String equality and integer comparisons beat complex calculations +5. **Batch ETL, Not Real-Time**: Update risk scores nightly, not per-query +6. **Deterministic Always**: Same input = same output, enables caching +7. **Test at Trading Scale**: 1 million rows minimum, 10 million for HFT systems +8. **Monitor Query Plans**: EXPLAIN every ABAC query to verify optimization + +--- + +## šŸ”§ Financial Services Performance Testing + +### **Load Test Template for Trading Systems** + +```sql +-- High-frequency trading simulation (1M positions) +WITH test_positions AS ( + SELECT + CONCAT('POS', LPAD(seq, 8, '0')) as position_id, + CASE WHEN MOD(seq, 4) = 0 THEN 'Equity' + WHEN MOD(seq, 4) = 1 THEN 'Fixed_Income' + WHEN MOD(seq, 4) = 2 THEN 'FX' + ELSE 'Commodities' + END as trading_desk, + RAND() * 1000000 as pnl, + current_timestamp() as test_start + FROM range(1000000) +) +SELECT + COUNT(*) as rows_processed, + MAX(test_start) as end_time, + CAST(COUNT(*) / + GREATEST(UNIX_TIMESTAMP(MAX(test_start)) - UNIX_TIMESTAMP(MIN(test_start)), 1) + AS BIGINT) as rows_per_second +FROM test_positions +WHERE trading_desk = 'Equity'; -- Simulates desk filtering +``` + +### **Performance Targets for Financial Services** + +- **Trading Position Queries**: > 100,000 rows/second +- **Card Transaction Masking**: > 500,000 rows/second +- **Customer Data Filtering**: > 1,000,000 rows/second +- **Query Overhead**: < 5% additional latency +- **Memory 
Usage**: < 1.5x baseline query + +--- + +## 🚨 Emergency Performance Recovery + +### **When ABAC Policies Kill Trading Performance** + +1. **Immediate Action**: + - Identify slow policy with query profiling + - Check for external lookups or subqueries + - Temporarily disable specific policy (not entire ABAC) + +2. **Diagnosis**: +```sql +-- Analyze query plan +EXPLAIN EXTENDED +SELECT * FROM fincat.finance.TradingPositions LIMIT 100; + +-- Look for: +-- - Correlated subqueries +-- - External table joins in mask functions +-- - Non-deterministic operations +``` + +3. **Fix**: Rewrite using performance patterns above + +4. **Validation**: Load test with 1M+ rows before re-enabling + +--- + +## šŸ’” Finance-Specific Optimization Tips + +### **For High-Frequency Trading Systems** +- Use integer-based clearance levels, not string comparisons +- Pre-filter positions by desk in ETL, use ABAC for secondary filtering +- Cache position snapshots, don't query live data in filters +- Minimize row filters, prefer column masking + +### **For Card Payment Processing** +- Mask card numbers client-side when possible, not in database +- Store masked versions alongside encrypted versions +- Use column-level encryption + ABAC masking together +- Pre-validate PCI clearance, don't check per-query + +### **For AML Compliance Reporting** +- Run pattern detection in batch (hourly/daily) +- Store results in investigation_status column +- ABAC filters based on stored flags, not live analysis +- Separate real-time monitoring from historical reporting + +### **For Cross-Border Operations** +- Partition tables by customer_region +- Use region-based clusters when possible +- Leverage Spark partition pruning with region filters +- Consider materialized views per region + +--- + +## šŸ“‹ Pre-Production Performance Checklist + +Before deploying finance ABAC to production: + +- [ ] All mask functions use only built-in SQL functions +- [ ] No external API calls or network operations +- [ ] No 
correlated subqueries or table joins in functions +- [ ] All row filters use column comparisons only +- [ ] Risk scores and tiers pre-computed and stored +- [ ] Tested with 1M+ rows per table minimum +- [ ] Query plans reviewed and optimized +- [ ] Performance monitoring in place +- [ ] Rollback plan documented +- [ ] Trading desk approved performance impact + +--- + +## šŸŽÆ Finance ABAC Architecture Principles + +### **Layered Security Without Performance Penalty** + +``` +Layer 1: Data Classification (At Rest) +ā”œā”€ā”€ Pre-compute risk scores, customer tiers, AML flags +ā”œā”€ā”€ Store classification in columns +└── ETL runs nightly or on trigger events + +Layer 2: ABAC Policies (Query Time) +ā”œā”€ā”€ Filter based on stored columns +ā”œā”€ā”€ Mask using simple string operations +└── Pure column logic, no external calls + +Layer 3: Monitoring (Continuous) +ā”œā”€ā”€ Query performance metrics +ā”œā”€ā”€ Policy effectiveness tracking +└── Compliance audit logging +``` + +**Result**: Security that scales to millions of queries per second without becoming a bottleneck. + +--- + +**šŸŽÆ Remember: In financial services, milliseconds are money. 
Great ABAC is invisible ABAC - secure by default, fast by design.** + +--- + +## šŸ¦ Finance-Specific Test Scenarios + +### **Scenario 1: High-Frequency Trading Query** +- **Volume**: 10,000 queries/second +- **Target**: < 10ms per query +- **Test**: Position filtering by trading desk + +### **Scenario 2: Card Authorization** +- **Volume**: 50,000 transactions/second +- **Target**: < 50ms per authorization +- **Test**: PCI-DSS card number masking + +### **Scenario 3: AML Batch Report** +- **Volume**: 10 million transactions +- **Target**: < 5 minutes total +- **Test**: Transaction filtering by clearance level + +### **Scenario 4: Customer Analytics** +- **Volume**: 100 million customer records +- **Target**: < 30 seconds for aggregations +- **Test**: Cross-table joins with deterministic masking + +--- + +**If your ABAC policies pass all four scenarios, you're ready for production financial services deployment.** šŸš€ diff --git a/uc-quickstart/utils/genie/aws/.gitignore b/uc-quickstart/utils/genie/aws/.gitignore new file mode 100644 index 00000000..d95b0429 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/.gitignore @@ -0,0 +1,20 @@ +# Local import IDs (copy from import_ids.env.example) +import_ids.env + +# Terraform state (may contain secrets) +*.tfstate +*.tfstate.backup +.terraform/ + +# User-specific credentials (only track the .example) +auth.auto.tfvars + +# Auto-fetched DDLs (user-specific) +ddl/_fetched.sql + +# AI-generated output (user-specific) +generated/ +masking_functions.sql + +# Auto-created Genie Space ID (managed by Terraform lifecycle) +.genie_space_id diff --git a/uc-quickstart/utils/genie/aws/.terraform.lock.hcl b/uc-quickstart/utils/genie/aws/.terraform.lock.hcl new file mode 100644 index 00000000..7cc54e78 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/.terraform.lock.hcl @@ -0,0 +1,56 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/databricks/databricks" { + version = "1.91.0" + constraints = "~> 1.91.0" + hashes = [ + "h1:T/COpKP/npWNyJqRB/Nppbg8GVZrzs9WyikS/vB4bKw=", + "zh:00a9e9ec95285a5e5bdd9940a342bf04c97a966bf088fc1eef14e8fda1208bfe", + "zh:7f9b169d43c5ed616d26f60f2f4126966228f2cc6c5ea900c6c2da27501f264f", + "zh:93a0f663981783d32f892d9ef27e9b21a8502ad42c044e91f02a3465a7adb0d8", + "zh:a82aad14d36adfc9326bdf283a20cc5d199887db8b20687636e96710504d9613", + "zh:bd5999d0030eb06fc893ff4b8440d4aa6e8aafec9a14bffe3629daf673a8e2e9", + "zh:c03acdd937a78850d33dd83b36659b040f1a1a0f55e458199e7aaa710b0b201f", + ] +} + +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.4" + constraints = "~> 3.2" + hashes = [ + "h1:L5V05xwp/Gto1leRryuesxjMfgZwjb7oool4WS1UEFQ=", + "zh:59f6b52ab4ff35739647f9509ee6d93d7c032985d9f8c6237d1f8a59471bbbe2", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:795c897119ff082133150121d39ff26cb5f89a730a2c8c26f3a9c1abf81a9c43", + "zh:7b9c7b16f118fbc2b05a983817b8ce2f86df125857966ad356353baf4bff5c0a", + "zh:85e33ab43e0e1726e5f97a874b8e24820b6565ff8076523cc2922ba671492991", + "zh:9d32ac3619cfc93eb3c4f423492a8e0f79db05fec58e449dee9b2d5873d5f69f", + "zh:9e15c3c9dd8e0d1e3731841d44c34571b6c97f5b95e8296a45318b94e5287a6e", + "zh:b4c2ab35d1b7696c30b64bf2c0f3a62329107bd1a9121ce70683dec58af19615", + "zh:c43723e8cc65bcdf5e0c92581dcbbdcbdcf18b8d2037406a5f2033b1e22de442", + "zh:ceb5495d9c31bfb299d246ab333f08c7fb0d67a4f82681fbf47f2a21c3e11ab5", + "zh:e171026b3659305c558d9804062762d168f50ba02b88b231d20ec99578a6233f", + "zh:ed0fe2acdb61330b01841fa790be00ec6beaac91d41f311fb8254f74eb6a711f", + ] +} + +provider "registry.terraform.io/hashicorp/time" { + version = "0.13.1" + constraints = "~> 0.12" + hashes = [ + "h1:ZT5ppCNIModqk3iOkVt5my8b8yBHmDpl663JtXAIRqM=", + "zh:02cb9aab1002f0f2a94a4f85acec8893297dc75915f7404c165983f720a54b74", + "zh:04429b2b31a492d19e5ecf999b116d396dac0b24bba0d0fb19ecaefe193fdb8f", + 
"zh:26f8e51bb7c275c404ba6028c1b530312066009194db721a8427a7bc5cdbc83a", + "zh:772ff8dbdbef968651ab3ae76d04afd355c32f8a868d03244db3f8496e462690", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:898db5d2b6bd6ca5457dccb52eedbc7c5b1a71e4a4658381bcbb38cedbbda328", + "zh:8de913bf09a3fa7bedc29fec18c47c571d0c7a3d0644322c46f3aa648cf30cd8", + "zh:9402102c86a87bdfe7e501ffbb9c685c32bbcefcfcf897fd7d53df414c36877b", + "zh:b18b9bb1726bb8cfbefc0a29cf3657c82578001f514bcf4c079839b6776c47f0", + "zh:b9d31fdc4faecb909d7c5ce41d2479dd0536862a963df434be4b16e8e4edc94d", + "zh:c951e9f39cca3446c060bd63933ebb89cedde9523904813973fbc3d11863ba75", + "zh:e5b773c0d07e962291be0e9b413c7a22c044b8c7b58c76e8aa91d1659990dfb5", + ] +} diff --git a/uc-quickstart/utils/genie/aws/ABAC_PROMPT.md b/uc-quickstart/utils/genie/aws/ABAC_PROMPT.md new file mode 100644 index 00000000..f78dd6f0 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/ABAC_PROMPT.md @@ -0,0 +1,372 @@ +# ABAC Configuration Generator — AI Prompt Template + +Copy everything below the line into ChatGPT, Claude, or Cursor. Paste your table DDL / `DESCRIBE TABLE` output where indicated. The AI will generate: + +1. **`masking_functions.sql`** — SQL UDFs for your masking and row-filter requirements +2. **`abac.auto.tfvars`** — A complete variable file ready for `terraform apply` + +--- + +## Prompt (copy from here) + +You are an expert in Databricks Unity Catalog Attribute-Based Access Control (ABAC). I will give you my table schemas from any industry or domain. You will analyze the columns for sensitivity (PII, financial, health, compliance, proprietary, etc.), then generate two files: + +### What is ABAC? + +ABAC uses governed **tags** on tables/columns and **FGAC policies** (column masks + row filters) to control data access based on **group membership**. The flow is: + +1. Create **groups** (access tiers like "Junior_Analyst", "Admin") +2. 
Create **tag policies** (e.g., `sensitivity` with values `public`, `confidential`, `restricted`) +3. Assign **tags** to tables and columns +4. Create **FGAC policies** that match tagged columns/tables and apply masking functions for specific groups + +### Available Masking Function Patterns + +Use these signatures. Replace `{catalog}.{schema}` with the user's catalog and schema. + +**PII:** +- `mask_pii_partial(input STRING) RETURNS STRING` — first + last char visible, middle masked +- `mask_ssn(ssn STRING) RETURNS STRING` — last 4 digits of SSN visible +- `mask_email(email STRING) RETURNS STRING` — masks local part, keeps domain +- `mask_phone(phone STRING) RETURNS STRING` — last 4 digits visible +- `mask_full_name(name STRING) RETURNS STRING` — reduces to initials + +**Financial:** +- `mask_credit_card_full(card_number STRING) RETURNS STRING` — all digits hidden +- `mask_credit_card_last4(card_number STRING) RETURNS STRING` — last 4 visible +- `mask_account_number(account_id STRING) RETURNS STRING` — deterministic SHA-256 token +- `mask_amount_rounded(amount DECIMAL(18,2)) RETURNS DECIMAL(18,2)` — round to nearest 10/100 +- `mask_iban(iban STRING) RETURNS STRING` — country code + last 4 + +**Health:** +- `mask_mrn(mrn STRING) RETURNS STRING` — last 4 digits of MRN +- `mask_diagnosis_code(code STRING) RETURNS STRING` — ICD category visible, specifics hidden + +**General:** +- `mask_redact(input STRING) RETURNS STRING` — replace with `[REDACTED]` +- `mask_hash(input STRING) RETURNS STRING` — full SHA-256 hash +- `mask_nullify(input STRING) RETURNS STRING` — return NULL + +**Row Filters (zero-argument, must be self-contained):** + +Row filter functions take no arguments and return BOOLEAN. They must be **fully +self-contained** — every function they call must either be a Databricks built-in +or must also be defined in the same SQL file (before the caller). Do NOT reference +undefined helper functions like `get_current_user_metadata`. 
+ +Common patterns with example implementations: + +- `filter_by_region_us() RETURNS BOOLEAN` — placeholder for US region filtering. `RETURN TRUE;` +- `filter_by_region_eu() RETURNS BOOLEAN` — placeholder for EU region filtering. `RETURN TRUE;` +- `filter_by_region_apac() RETURNS BOOLEAN` — placeholder for APAC region filtering. `RETURN TRUE;` +- `filter_trading_hours() RETURNS BOOLEAN` — restrict to non-market hours. `RETURN HOUR(NOW()) < 9 OR HOUR(NOW()) > 16;` +- `filter_audit_expiry() RETURNS BOOLEAN` — time-limited access. `RETURN CURRENT_DATE() <= DATE('2025-12-31');` + +Note: The semicolon must be the **last character** on the RETURN line. Do NOT add inline comments after it (e.g., `RETURN TRUE; -- comment` breaks automated deployment). + +If a row filter needs user-specific metadata (e.g. the current user's region), +define a helper function in the same SQL file **before** the filter that calls it. +For example, define `get_current_user_metadata(key STRING) RETURNS STRING` that +queries a `user_metadata` table or returns a stub `CAST(NULL AS STRING)`, then +reference it from the filter. + +These are common patterns. If the user's data requires masking not covered above (e.g., vehicle VINs, student IDs, device serial numbers, product SKUs), create a new function following the same pattern (NULL-safe CASE expression, COMMENT describing usage). + +### Output Format — File 1: `masking_functions.sql` + +Group functions by target schema. Only create each function in the schema(s) where +it is referenced by `function_schema` in fgac_policies. If a function is used by +policies targeting multiple schemas, include it in each schema that needs it. 
+ +**CRITICAL — SQL formatting rules:** +- Each function MUST end with a semicolon (`;`) as the **last character on that line** +- Do NOT put inline comments after the semicolon (e.g., `RETURN TRUE; -- comment` will break parsing) +- Put comments on separate lines above the function or in the COMMENT clause + +```sql +-- === schema_a functions === +USE CATALOG my_catalog; +USE SCHEMA schema_a; + +CREATE OR REPLACE FUNCTION mask_diagnosis_code(code STRING) +RETURNS STRING +COMMENT 'description' +RETURN CASE ... END; + +-- Row filter — semicolon must be the last char on the RETURN line +CREATE OR REPLACE FUNCTION filter_by_region_us() +RETURNS BOOLEAN +COMMENT 'Filters rows to show only US region data' +RETURN TRUE; + +-- === schema_b functions === +USE CATALOG my_catalog; +USE SCHEMA schema_b; + +CREATE OR REPLACE FUNCTION mask_credit_card_full(card_number STRING) +RETURNS STRING +COMMENT 'description' +RETURN CASE ... END; +``` + +Only include functions the user actually needs. If a library function works as-is, still include it so the user has a self-contained SQL file. + +### Output Format — File 2: `abac.auto.tfvars` + +```hcl +groups = { + "GroupName" = { description = "What this group can see" } +} + +tag_policies = [ + { key = "tag_name", description = "...", values = ["val1", "val2"] }, +] + +# entity_name: always use fully qualified names (catalog.schema.table for tables, +# catalog.schema.table.column for columns). 
+tag_assignments = [ + # Table-level tags (optional — scope column masks or row filters to specific tables, or for governance): + # { entity_type = "tables", entity_name = "catalog.schema.Table", tag_key = "tag_name", tag_value = "val1" }, + { entity_type = "columns", entity_name = "catalog.schema.Table.Column", tag_key = "tag_name", tag_value = "val1" }, +] + +fgac_policies = [ + # Column mask (when_condition is optional — omit to apply to all tables): + { + name = "policy_name" + policy_type = "POLICY_TYPE_COLUMN_MASK" + catalog = "my_catalog" + to_principals = ["GroupName"] + comment = "Description" + match_condition = "hasTagValue('tag_name', 'val1')" + match_alias = "alias" + function_name = "function_name" + function_catalog = "my_catalog" + function_schema = "my_schema" + }, + # Row filter (when_condition is optional — omit to apply to all tables): + { + name = "filter_name" + policy_type = "POLICY_TYPE_ROW_FILTER" + catalog = "my_catalog" + to_principals = ["GroupName"] + comment = "Description" + when_condition = "hasTagValue('tag_name', 'val1')" + function_name = "filter_function" + function_catalog = "my_catalog" + function_schema = "my_schema" + }, +] + +# when_condition is OPTIONAL for both column masks and row filters: +# - Column masks: omit when_condition to let match_condition (in match_columns) select +# columns across ALL tables. Or set when_condition (e.g. "hasTag('tag_name')") to +# scope the mask to specific tagged tables only. +# - Row filters: omit when_condition to apply to all tables, or provide it to scope +# to specific tagged tables. +# - If you use when_condition, the referenced tags must be assigned at the TABLE level +# (entity_type = "tables" in tag_assignments). 
+ +group_members = {} +``` + +### Validation + +After generating both files, the user should validate them before running `terraform apply`: + +```bash +pip install python-hcl2 +python validate_abac.py abac.auto.tfvars masking_functions.sql +``` + +This checks cross-references (groups, tags, functions), naming conventions, and structure. Fix any `[FAIL]` errors before proceeding. + +### CRITICAL — Valid Condition Syntax + +The `match_condition` and `when_condition` fields ONLY support these functions: + +- `hasTagValue('tag_key', 'tag_value')` — matches entities with a specific tag value +- `hasTag('tag_key')` — matches entities that have the tag (any value) +- Combine with `AND` / `OR` + +**FORBIDDEN** — the following will cause compilation errors: +- `columnName() = '...'` — NOT supported +- `columnName() IN (...)` — NOT supported +- `tableName() = '...'` — NOT supported +- Any comparison operators (`=`, `!=`, `<`, `>`, `IN`) + +To target specific columns, use **distinct tag values** assigned to those columns, not `columnName()`. For example, instead of `hasTagValue('phi_level', 'full_phi') AND columnName() = 'MRN'`, create a separate tag value like `phi_level = 'mrn_restricted'` and assign it only to the MRN column. + +### CRITICAL — One Mask Per Column Per Group + +Each column must be matched by **at most one** column mask policy per principal group. If two policies with the same `to_principals` both match a column, Databricks will reject the query with `MULTIPLE_MASKS`. This means: + +1. **No overlapping match conditions**: If two column mask policies target the same group and their `match_condition` values both evaluate to true for any column, you'll get a conflict. For example, `hasTagValue('phi_level', 'masked_phi')` and `hasTagValue('phi_level', 'masked_phi') AND hasTag('phi_level')` are logically identical — the `AND hasTag(...)` is always true when `hasTagValue(...)` already matches — so both policies would apply to the same columns. + +2. 
**One tag value = one masking function**: Every column mask policy has a `match_condition` that selects columns by tag value, and ALL columns matching that value get the SAME masking function. You cannot use `columnName()` to differentiate — it is not supported. Therefore, if columns need different masking functions, they MUST have different tag values, even if they belong to the same sensitivity category. + + **Common mistake (WRONG):** Tagging FirstName, Email, and AccountID all as `pii_level = 'masked'`, then creating three separate policies — `mask_pii_partial`, `mask_email`, and `mask_account_number` — each matching `hasTagValue('pii_level', 'masked')`. This causes all three masks to apply to all three columns. + + **Correct approach:** Use distinct tag values per masking need: + - FirstName, LastName → `pii_level = 'masked'` → policy uses `mask_pii_partial` + - Email → `pii_level = 'masked_email'` → policy uses `mask_email` + - AccountID → `pii_level = 'masked_account'` → policy uses `mask_account_number` + + Remember to add all new tag values to the `tag_policies` `values` list. + +3. **Quick check**: For every pair of column mask policies that share any group in `to_principals`, verify that their `match_condition` values cannot both be true for the same column. If they can, either merge the policies or split the tag values. The number of distinct tag values in `tag_policies` should be >= the number of distinct masking functions you want to apply for that tag key. + +### CRITICAL — Internal Consistency + +Every tag value used in `tag_assignments` and in `match_condition` / `when_condition` MUST be defined in `tag_policies`. Before generating, cross-check: + +1. Every `tag_value` in `tag_assignments` must appear in the `values` list of the corresponding `tag_key` in `tag_policies` +2. Every `hasTagValue('key', 'value')` in `match_condition` or `when_condition` must reference a `key` and `value` that exist in `tag_policies` +3. 
Every `function_name` in `fgac_policies` must have a corresponding `CREATE OR REPLACE FUNCTION` in `masking_functions.sql` +4. Every group in `to_principals` / `except_principals` must be defined in `groups` +5. If any generated function calls another non-built-in function (e.g. a helper like `get_current_user_metadata`), that helper MUST also be defined in `masking_functions.sql` **before** the function that calls it. Never reference undefined functions + +Violating any of these causes validation failures. Double-check consistency across all three sections (`tag_policies`, `tag_assignments`, `fgac_policies`) before outputting. + +**Common mistake 1 — cross-key value leakage**: Do NOT use a value from one tag policy in a different tag policy. For example, if `pii_level` has value `"masked"` but `compliance_level` does not, you MUST NOT write `tag_key = "compliance_level", tag_value = "masked"`. Each tag assignment and condition must use only the values defined for that specific tag key. + +**Common mistake 2 — generic fallback values**: Do NOT use a generic value like `"masked"` in a tag assignment or match_condition unless that exact string appears in the `values` list for that tag key. If you created distinct values (e.g., `"masked_diagnosis"`, `"masked_notes"`) for a tag policy, you MUST use one of those — not a shortened or generic form. For example, if `phi_level` has values `["public", "masked_diagnosis", "masked_notes", "restricted"]`, writing `tag_value = "masked"` will fail validation because `"masked"` is not in the list. + +**Final check before outputting**: Enumerate every unique `tag_value` across all `tag_assignments` entries and every value referenced in `hasTagValue()` calls in `match_condition` / `when_condition`. For each one, confirm it appears in the `values` list of its corresponding `tag_key` in `tag_policies`. If any value is missing, either add it to the tag policy or change the assignment/condition to use an existing value. 
+ +### Instructions + +1. Generate `masking_functions.sql` with functions **grouped by target schema**. Use separate `USE CATALOG` / `USE SCHEMA` blocks for each schema. Only deploy each function to the schema(s) where it is referenced by `function_schema` in fgac_policies — do NOT duplicate all functions into every schema. Do NOT include `uc_catalog_name`, `uc_schema_name`, or authentication variables (databricks_account_id, etc.) in the generated abac.auto.tfvars. Every `fgac_policies` entry MUST include `catalog`, `function_catalog`, and `function_schema` — set them to the catalog/schema that each policy's table belongs to. +2. Analyze each column in the user's tables for sensitivity. Common categories include but are not limited to: + - PII (names, emails, SSN, phone, address, date of birth, national IDs) + - Financial (credit cards, account numbers, amounts, IBAN, trading data) + - Health / PHI (MRN, diagnosis codes, clinical notes, insurance IDs) + - Regional / residency (region columns that need row filtering) + - Confidential business data (proprietary scores, internal metrics, trade secrets) + - Compliance-driven fields (audit logs, access timestamps, regulatory identifiers) + Adapt to whatever domain the user's tables belong to — retail, manufacturing, education, telecom, government, etc. Do NOT limit analysis to healthcare or finance. +3. Propose groups — typically 2-5 access tiers (e.g., restricted, standard, privileged, admin) +4. Design tag policies — one per sensitivity dimension (e.g., `pii_level`, `pci_clearance`) +5. Map tags to the user's specific columns. **Use distinct tag values to differentiate columns that need different masking** — do NOT use `columnName()` in conditions. Table-level tags (entity_type = "tables") are optional — use them to scope column masks or row filters to specific tables, or for governance. **Always use fully qualified entity names** (e.g. `catalog.schema.Table` for tables, `catalog.schema.Table.Column` for columns) +6. 
Select masking functions from the library above (or create new ones) +7. Generate both output files. For entity names in tag_assignments, always use **fully qualified** names (`catalog.schema.table` or `catalog.schema.table.column`). For function_name in fgac_policies, use relative names only (e.g. `mask_pii`). Every fgac_policy MUST include `catalog`, `function_catalog`, and `function_schema`. **CRITICAL**: set `function_schema` to the schema where the tagged columns actually live — do NOT default all policies to the first schema. In `masking_functions.sql`, group the `CREATE FUNCTION` statements by schema with separate `USE SCHEMA` blocks. Only create each function in the schema where it is needed +8. Every `match_condition` and `when_condition` MUST only use `hasTagValue()` and/or `hasTag()` — no other functions or operators +9. Generate Genie Space config — all nine fields below. **Derive everything from the user's actual tables, columns, and domain** — do NOT copy the finance/healthcare examples below if the user's data is from a different industry. Adapt terminology, metrics, filters, and joins to whatever vertical the tables belong to (retail, manufacturing, telecom, education, logistics, etc.): + - `genie_space_title` — a concise, descriptive title reflecting the user's domain (e.g., finance → "Financial Compliance Analytics", retail → "Retail Sales & Inventory Explorer", telecom → "Network Performance Dashboard") + - `genie_space_description` — 1–2 sentence summary of what the space covers and who it's for + - `genie_sample_questions` — 5–10 natural-language questions a business user in that domain would ask (shown as conversation starters in the UI). Must reference the user's actual table/column names. + - `genie_instructions` — domain-specific guidance for the Genie LLM. 
**Must include business defaults** — look at status/state columns in the user's tables and define which values are the default filter (e.g., if a table has `OrderStatus` with values like 'Fulfilled'/'Cancelled'/'Pending', instruct: "default to fulfilled orders"). Also cover date conventions, metric calculations, terminology, and masking awareness relevant to the user's domain. + - `genie_benchmarks` — 3–5 benchmark questions with ground-truth SQL. **Each question must be unambiguous and self-contained** — include explicit qualifiers so the question and SQL agree on scope (e.g., "What is the average risk score for active customers?" not "What is the average customer risk score?"). Avoid questions that could reasonably be interpreted with different WHERE clauses. + - `genie_sql_filters` — default WHERE clauses derived from the user's status/state columns (e.g., active records, completed transactions, open orders). Each filter has `sql`, `display_name`, `comment`, and `instruction`. + - `genie_sql_measures` — standard aggregate metrics derived from the user's numeric columns (e.g., sums, averages, counts that are meaningful in the domain). Each measure has `alias`, `sql`, `display_name`, `comment`, and `instruction`. + - `genie_sql_expressions` — computed dimensions derived from the user's date/category columns (e.g., year extraction, bucketing, status grouping). Each expression has `alias`, `sql`, `display_name`, `comment`, and `instruction`. + - `genie_join_specs` — relationships between the user's tables based on foreign key columns (look for matching ID columns like `CustomerID`, `OrderID`, `ProductID`). Each join has `left_table`, `left_alias`, `right_table`, `right_alias`, `sql`, `comment`, and `instruction`. + +### Output Format — Genie Space Config (in `abac.auto.tfvars`) + +Include these variables alongside groups, tag_policies, etc. 
The example below shows a **finance/healthcare** scenario — adapt all values to match the user's actual tables and industry: + +```hcl +genie_space_title = "Financial & Clinical Analytics" +genie_space_description = "Explore transaction data, patient encounters, and compliance metrics. Designed for analysts, compliance officers, and clinical staff." + +genie_sample_questions = [ + "What is the total revenue by region for last quarter?", + "Show the top 10 active customers by transaction volume", + "Which accounts have been flagged for AML review?", + "How many patient encounters occurred last month?", + "What is the average completed transaction amount by account type?", +] + +genie_instructions = "When asked about 'customers' without a status qualifier, default to active customers (CustomerStatus = 'Active'). When asked about 'transactions' without specifying status, default to completed transactions (TransactionStatus = 'Completed'). 'Last month' means the previous calendar month (not last 30 days). Round monetary values to 2 decimal places. Patient names are masked for non-clinical roles — queries about patient counts or encounter dates are always allowed." + +genie_benchmarks = [ + { + question = "What is the total amount of completed transactions?" + sql = "SELECT SUM(Amount) as total_amount FROM catalog.schema.transactions WHERE TransactionStatus = 'Completed'" + }, + { + question = "How many patient encounters occurred last month?" + sql = "SELECT COUNT(*) FROM catalog.schema.encounters WHERE EncounterDate >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL 1 MONTH) AND EncounterDate < DATE_TRUNC('month', CURRENT_DATE)" + }, + { + question = "What is the average risk score for active customers?" 
+ sql = "SELECT AVG(RiskScore) as avg_risk_score FROM catalog.schema.customers WHERE CustomerStatus = 'Active'" + }, +] + +genie_sql_filters = [ + { + sql = "customers.CustomerStatus = 'Active'" + display_name = "active customers" + comment = "Only include customers with Active status" + instruction = "Apply when the user asks about customers without specifying a status" + }, + { + sql = "transactions.TransactionStatus = 'Completed'" + display_name = "completed transactions" + comment = "Only include completed transactions" + instruction = "Apply when the user asks about transactions or amounts without specifying a status" + }, +] + +genie_sql_measures = [ + { + alias = "total_revenue" + sql = "SUM(transactions.Amount)" + display_name = "total revenue" + comment = "Sum of all transaction amounts" + instruction = "Use for revenue, total amount, or sales calculations" + }, + { + alias = "avg_risk_score" + sql = "AVG(customers.RiskScore)" + display_name = "average risk score" + comment = "Average AML risk score across customers" + instruction = "Use when asked about risk scores or risk averages" + }, +] + +genie_sql_expressions = [ + { + alias = "transaction_year" + sql = "YEAR(transactions.TransactionDate)" + display_name = "transaction year" + comment = "Extracts year from transaction date" + instruction = "Use for year-over-year analysis of transactions" + }, +] + +genie_join_specs = [ + { + left_table = "catalog.schema.accounts" + left_alias = "accounts" + right_table = "catalog.schema.customers" + right_alias = "customers" + sql = "accounts.CustomerID = customers.CustomerID" + comment = "Join accounts to customers on CustomerID" + instruction = "Use when you need customer details for account queries" + }, + { + left_table = "catalog.schema.transactions" + left_alias = "transactions" + right_table = "catalog.schema.accounts" + right_alias = "accounts" + sql = "transactions.AccountID = accounts.AccountID" + comment = "Join transactions to accounts on AccountID" + 
instruction = "Use when you need account or customer context for transactions" + }, +] +``` + +--- + +### MY TABLES (paste below) + +Tables are provided with fully qualified names (catalog.schema.table). +Derive the catalog and schema for each policy from the table's fully qualified name. + +``` +-- Table DDLs are auto-fetched and pasted here. +-- Each table is fully qualified: my_catalog.my_schema.my_table +``` diff --git a/uc-quickstart/utils/genie/aws/GENIE_SPACE_PERMISSIONS.md b/uc-quickstart/utils/genie/aws/GENIE_SPACE_PERMISSIONS.md new file mode 100644 index 00000000..ca16ce05 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/GENIE_SPACE_PERMISSIONS.md @@ -0,0 +1,71 @@ +# Permissions Required for a Genie Space + +This document lists everything that must be in place for business users (the groups defined in `abac.auto.tfvars`) to use an AI/BI Genie Space. + +## 1. Identity + +- **Business groups:** Created at account level (Terraform: `databricks_group` in `main.tf`). + Groups are defined dynamically in `abac.auto.tfvars` under the `groups` variable. +- **Workspace assignment:** Account-level groups are assigned to the workspace (Terraform: `databricks_mws_permission_assignment` with `USER` in `main.tf`). + +## 2. Entitlements (Consumer = Databricks One UI only) + +- **Consumer access:** When `workspace_consume` is the **only** entitlement for a user/group, they get the **Databricks One UI** experience (dashboards, Genie spaces, apps) and do **not** get the full workspace UI (clusters, notebooks, etc.). +- **Terraform:** `databricks_entitlements` in `main.tf` sets `workspace_consume = true` for each group. No other entitlements are set so that consumers see One UI only. + +## 3. Compute + +- **SQL warehouse:** A single SQL warehouse is used for both masking function deployment and the Genie Space. Genie embeds on this warehouse; end users do **not** need explicit **CAN USE** on the warehouse. 
+- **Terraform:** `warehouse.tf` handles warehouse resolution: + - `sql_warehouse_id` set in `env.auto.tfvars` -> reuses the existing warehouse (dev) + - `sql_warehouse_id` empty or omitted -> auto-creates a serverless warehouse (prod) + +## 4. Data access + +- **Unity Catalog:** At least **SELECT** (and **USE CATALOG** / **USE SCHEMA**) on all UC objects used by the Genie Space. Catalogs are auto-derived from fully-qualified table names in `tag_assignments` and `fgac_policies`. ABAC policies further restrict what each group sees at query time. +- **Terraform:** `uc_grants.tf` grants `USE_CATALOG`, `USE_SCHEMA`, and `SELECT` on all relevant catalogs to all configured groups. + +## 5. Genie Space (create + ACLs) + +- **Genie Space:** Create a Genie Space with the tables from `uc_tables` (in `env.auto.tfvars`) and grant at least **CAN VIEW** and **CAN RUN** to all groups. +- **Automation:** Terraform manages Genie Space lifecycle via `genie_space.tf`: + - **`genie_space_id` empty** (greenfield): `terraform apply` auto-creates a Genie Space from `uc_tables`, sets ACLs, and trashes the space on `terraform destroy`. + - **`genie_space_id` set** (existing): `terraform apply` only applies CAN_RUN ACLs to the existing space. + +### Auto-create mode + +Set `genie_space_id = ""` in `env.auto.tfvars` and ensure `uc_tables` is non-empty. Terraform runs `genie_space.sh create` automatically during apply. Wildcards (`catalog.schema.*`) are expanded via the UC Tables API. + +### Existing space mode + +Set `genie_space_id` to your Genie Space ID in `env.auto.tfvars`. Terraform runs `genie_space.sh set-acls` to grant CAN_RUN to all configured groups. 
+
+### Manual script usage
+
+The script can also be used independently outside of Terraform:
+
+```bash
+# Create
+GENIE_GROUPS_CSV=$(terraform output -raw genie_groups_csv) \
+GENIE_TABLES_CSV="cat.schema.t1,cat.schema.t2" \
+./scripts/genie_space.sh create
+
+# Set ACLs only
+GENIE_GROUPS_CSV=$(terraform output -raw genie_groups_csv) \
+GENIE_SPACE_OBJECT_ID=<genie-space-id> \
+./scripts/genie_space.sh set-acls
+
+# Trash
+GENIE_ID_FILE=.genie_space_id ./scripts/genie_space.sh trash
+```
+
+## Summary checklist
+
+| Requirement            | Implemented in                                                                 |
+|------------------------|--------------------------------------------------------------------------------|
+| Groups                 | Terraform: `main.tf` (from `groups` in `abac.auto.tfvars`)                     |
+| Workspace assignment   | Terraform: `main.tf`                                                           |
+| Consumer (One UI only) | Terraform: `main.tf` (entitlements)                                            |
+| Warehouse              | Terraform: `warehouse.tf` (reuses `sql_warehouse_id` or auto-creates)          |
+| UC data (SELECT, etc.) | Terraform: `uc_grants.tf` (auto-derived catalogs)                              |
+| Genie Space + ACLs     | Terraform: `genie_space.tf` (auto-create or ACLs-only based on `genie_space_id`) |
diff --git a/uc-quickstart/utils/genie/aws/IMPORT_EXISTING.md b/uc-quickstart/utils/genie/aws/IMPORT_EXISTING.md
new file mode 100644
index 00000000..25e4bc4c
--- /dev/null
+++ b/uc-quickstart/utils/genie/aws/IMPORT_EXISTING.md
@@ -0,0 +1,41 @@
+# Import Existing Resources (Overwrite / Adopt)
+
+If the warehouse, groups, or tag policies **already exist**, Terraform will fail with "already exists". Use the import script below so Terraform can adopt and overwrite them.
+
+## Prerequisites
+
+Before running the import script, ensure:
+
+1. `auth.auto.tfvars` is configured with valid credentials and `env.auto.tfvars` with your environment.
+2. `abac.auto.tfvars` is configured with the groups and tag policies you want to import.
+3. `terraform init` has been run.
+ +## Usage + +From **genie/aws**: + +```bash +# Import all existing resources (groups, tag policies, FGAC policies) +./scripts/import_existing.sh + +# Import only groups +./scripts/import_existing.sh --groups-only + +# Import only tag policies +./scripts/import_existing.sh --tags-only + +# Dry run — show what would be imported without running terraform import +./scripts/import_existing.sh --dry-run +``` + +The script reads group names from `abac.auto.tfvars` and tag policy keys from the same file. For each resource, it checks whether an import is needed and runs `terraform import` if the resource exists in Databricks but not in Terraform state. + +## Optional: reuse an existing warehouse + +To use an existing warehouse instead of auto-creating one, set in **env.auto.tfvars**: + +```hcl +sql_warehouse_id = "" +``` + +Terraform will skip warehouse creation and reuse this ID for masking function deployment, Genie Space, and outputs. diff --git a/uc-quickstart/utils/genie/aws/Makefile b/uc-quickstart/utils/genie/aws/Makefile new file mode 100644 index 00000000..0e14182c --- /dev/null +++ b/uc-quickstart/utils/genie/aws/Makefile @@ -0,0 +1,95 @@ +.PHONY: setup generate validate validate-generated promote plan apply sync-tags destroy clean help + +SHELL := /bin/bash +export DATABRICKS_USER_AGENT_EXTRA := genierails/0.1.0 + +help: ## Show this help + @grep -E '^[a-z_-]+:.*## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' + +setup: ## Copy example files and prompt for credentials + @echo "=== Setup ===" + @if [ ! -f auth.auto.tfvars ]; then \ + cp auth.auto.tfvars.example auth.auto.tfvars; \ + echo "Created auth.auto.tfvars — edit it with your credentials."; \ + else \ + echo "auth.auto.tfvars already exists — skipping."; \ + fi + @if [ ! 
-f env.auto.tfvars ]; then \ + cp env.auto.tfvars.example env.auto.tfvars; \ + echo "Created env.auto.tfvars — edit it with your tables and environment config."; \ + else \ + echo "env.auto.tfvars already exists — skipping."; \ + fi + @mkdir -p ddl generated + @echo "Created ddl/ and generated/ directories." + @echo "" + @echo "Next steps:" + @echo " 1. Edit credentials (gitignored): $$(pwd)/auth.auto.tfvars" + @echo " 2. Edit tables & environment: $$(pwd)/env.auto.tfvars" + @echo " 3. Run: make generate" + +generate: ## Run generate_abac.py to produce masking SQL + tfvars + @echo "=== Generate ABAC Config ===" + python generate_abac.py + +validate-generated: ## Validate generated/ files before copying to root + @echo "=== Validate (generated/) ===" + @if [ -f generated/masking_functions.sql ]; then \ + python validate_abac.py generated/abac.auto.tfvars generated/masking_functions.sql; \ + else \ + python validate_abac.py generated/abac.auto.tfvars; \ + fi + +validate: ## Validate root abac.auto.tfvars + masking_functions.sql + @echo "=== Validate ===" + @if [ -f masking_functions.sql ]; then \ + python validate_abac.py abac.auto.tfvars masking_functions.sql; \ + elif [ -f generated/masking_functions.sql ]; then \ + python validate_abac.py abac.auto.tfvars generated/masking_functions.sql; \ + else \ + python validate_abac.py abac.auto.tfvars; \ + fi + +promote: ## Validate generated/ and copy to root + @echo "=== Promote generated/ to root ===" + @if [ -f generated/abac.auto.tfvars ]; then \ + python validate_abac.py generated/abac.auto.tfvars generated/masking_functions.sql && \ + cp generated/abac.auto.tfvars abac.auto.tfvars && \ + cp generated/masking_functions.sql masking_functions.sql && \ + echo "Promoted generated/ files to root."; \ + else \ + echo "No generated/abac.auto.tfvars found. 
Run 'make generate' first."; \ + exit 1; \ + fi + +plan: ## Run terraform init + plan + @echo "=== Terraform Plan ===" + terraform init -input=false + terraform plan + +sync-tags: ## Sync tag policy values to Databricks (bypasses provider reordering bug) + @echo "=== Sync Tag Policies ===" + @python3 scripts/sync_tag_policies.py + +apply: promote sync-tags ## Validate, promote, sync tags, then terraform apply + @echo "=== Terraform Apply ===" + terraform init -input=false + @echo "--- Importing tag policies into state ---" + @python3 -c "import hcl2,sys; d=hcl2.load(open('abac.auto.tfvars')); [print(tp['key']) for tp in d.get('tag_policies',[])]" 2>/dev/null | \ + while read key; do \ + terraform state rm "databricks_tag_policy.policies[\"$$key\"]" 2>/dev/null || true; \ + terraform import "databricks_tag_policy.policies[\"$$key\"]" "$$key" 2>/dev/null || true; \ + done + @echo "--- Running terraform apply ---" + terraform apply -parallelism=1 -auto-approve + +destroy: ## Run terraform destroy (drops masking functions if sql_warehouse_id is set) + @echo "=== Terraform Destroy ===" + terraform destroy -auto-approve + +clean: ## Remove generated files, Terraform state, and .terraform/ + @echo "=== Clean ===" + rm -rf generated/abac.auto.tfvars generated/masking_functions.sql generated/generated_response.md + rm -rf .terraform *.tfstate *.tfstate.backup .terraform.lock.hcl + @echo "Cleaned generated files and Terraform state." + @echo "NOTE: auth.auto.tfvars, env.auto.tfvars, and abac.auto.tfvars were NOT removed." diff --git a/uc-quickstart/utils/genie/aws/README.md b/uc-quickstart/utils/genie/aws/README.md new file mode 100644 index 00000000..464229d6 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/README.md @@ -0,0 +1,284 @@ +# GenieRails + +Put Genie onboarding on rails — with built-in guardrails. 
An AI-powered Terraform quickstart that gets business users into Genie quickly and safely — ABAC governance, masking functions, and a fully configured Genie Space with AI-generated sample questions, instructions, benchmarks, SQL filters, measures, and join specs — all from three config files, no `.tf` editing required. + +## What This Quickstart Automates + +- **AI-generated ABAC config** — Point at your tables, and an LLM analyzes column sensitivity to generate groups, tag policies, tag assignments, FGAC policies, and masking functions automatically. +- **Business groups** — Create account-level groups (access tiers) and optionally manage group membership. +- **Workspace onboarding** — Assign groups to a target workspace with Databricks One consumer entitlements. +- **Data access grants** — Apply minimum Unity Catalog privileges (`USE_CATALOG`, `USE_SCHEMA`, `SELECT`) for data exposed through Genie. +- **ABAC governance** — Create governed tag policies, tag assignments on tables/columns, and FGAC policies (column masks + row filters). +- **Masking functions** — Auto-deploy SQL UDFs to enforce column-level data masking (e.g., mask SSN, redact PII, hash emails). +- **Genie Space** — Auto-create a new Genie Space from your tables, or bring an existing one. 
New spaces include AI-generated config: + - **Sample questions** — Conversation starters tailored to your data domain + - **Instructions** — Domain-specific LLM guidance with business defaults (e.g., "customer" means active by default) + - **Benchmarks** — Unambiguous ground-truth question + SQL pairs for evaluating Genie accuracy + - **SQL filters** — Default WHERE clauses (e.g., active customers, completed transactions) that guide Genie's SQL generation + - **SQL measures & expressions** — Standard metrics (total revenue, avg risk score) and computed dimensions (transaction year) + - **Join specs** — Table relationships with join conditions so Genie knows how to combine tables + - **Title & description** — Contextual naming based on your tables and domain + - For existing spaces, set `genie_space_id` in `env.auto.tfvars` to apply `CAN_RUN` ACLs for all configured business groups +- **SQL warehouse** — Auto-create a serverless warehouse or reuse an existing one. + +## How It Works + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ YOU PROVIDE (one-time setup) │ +ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤ +│ │ +│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ +│ │ auth.auto.tfvars │ │ env.auto.tfvars │ │ +│ │ (secrets — gitignored) │ │ (environment — checked in) │ │ +│ │ │ │ │ │ +│ │ databricks_account_id = "..."│ │ uc_tables = ["cat.sch.*"] │ │ +│ │ databricks_client_id = "..."│ │ sql_warehouse_id = "" │ │ +│ │ databricks_client_secret │ │ genie_space_id = "" │ 
│ +│ │ databricks_workspace_host │ │ │ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ + ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ make generate (generate_abac.py) │ +│ │ +│ 1. Fetches DDLs from Unity Catalog (via Databricks SDK) │ +│ 2. Reads ABAC_PROMPT.md + DDLs ──▶ LLM (Claude Sonnet) │ +│ │ +│ Providers: Databricks FMAPI (default) | Anthropic | OpenAI │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + ā–¼ ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ generated/ (output folder) │ +│ │ +│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ +│ │ masking_functions.sql │ │ abac.auto.tfvars │ │ +│ │ │ │ (ABAC + Genie — no credentials) │ │ +│ │ SQL UDFs: │ │ │ │ +│ │ • mask_pii_partial() │ │ groups ─ access 
tiers │ │ +│ │ • mask_ssn() │ │ tag_policies ─ sensitivity tags │ │ +│ │ • mask_email() │ │ tag_assignments ─ tags on cols │ │ +│ │ • filter_by_region() │ │ fgac_policies ─ masks & filters │ │ +│ │ • ... │ │ genie_space_title / description │ │ +│ │ │ │ genie_sample_questions (5–10) │ │ +│ │ │ │ genie_instructions │ │ +│ │ │ │ genie_benchmarks (3–5 w/ SQL) │ │ +│ │ │ │ genie_sql_filters / measures │ │ +│ │ │ │ genie_sql_expressions │ │ +│ │ │ │ genie_join_specs │ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ ā–² TUNE & VALIDATE │ + │ │ make validate-generated + │ │ (repeat until PASS) + ā–¼ ā–¼ +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ make apply (validate → promote → terraform apply) │ +│ Loads: auth.auto.tfvars + env.auto.tfvars + abac.auto.tfvars │ +│ │ +│ Creates in Databricks: │ +│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ +│ │ Account Groups │ │ Tag Policies │ │ Tag Assignments │ │ +│ │ Analyst │ │ pii_level │ │ Customers.SSN │ │ +│ │ Manager │ │ phi_level │ │ → pii_level=masked │ │ +│ │ Compliance │ │ data_region │ │ Billing.Amount │ │ +│ │ Admin │ │ │ │ → pii_level=masked │ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ 
ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ +│ │ FGAC Policies (Column Masks + Row Filters) │ │ +│ │ │ │ +│ │ "Analyst sees SSN as ***-**-1234" ──▶ mask_ssn() │ │ +│ │ "Manager sees notes as [REDACTED]" ──▶ mask_redact() │ │ +│ │ "US_Staff sees only US rows" ──▶ filter_by_region() │ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +│ ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” │ +│ │ Masking Functions │ │ UC Grants │ │ Genie Space │ │ +│ │ (auto-deploy UDFs) │ │ USE_CATALOG │ │ • sample questions │ │ +│ │ │ │ USE_SCHEMA │ │ • instructions │ │ +│ │ + SQL Warehouse │ │ SELECT │ │ • benchmarks │ │ +│ │ (auto-created if │ │ │ │ • sql filters / │ │ +│ │ needed) │ │ │ │ measures / joins │ │ +│ │ │ │ │ │ • CAN_RUN ACLs │ │ +│ │ │ │ │ │ for all groups │ │ +│ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +## Prerequisites + +- Tables must exist in Unity Catalog before running `make generate` +- A Databricks **service principal** with the following roles: + + +| Role | Why it's needed | +| ------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| **Account Admin** | Create account-level groups, assign groups to workspace, manage group membership | +| **Workspace Admin** | Grant entitlements (`workspace_consume`), create/manage Genie Spaces and permissions | +| **Metastore Admin** | Create governed tag policies (`databricks_tag_policy`), and grant itself `USE_CATALOG`, `USE_SCHEMA`, `EXECUTE`, `MANAGE`, `CREATE_FUNCTION` on any catalog to create FGAC policies, assign tags, and deploy masking functions. Without this role, tag policies must be pre-created manually and catalog-level privileges must be granted by a catalog owner | + + +## Quick Start + +```bash +make setup # 1. Creates auth.auto.tfvars + env.auto.tfvars from examples +vi auth.auto.tfvars # Fill in credentials (gitignored) +vi env.auto.tfvars # Fill in uc_tables, sql_warehouse_id (checked in) + +make generate # 2. Fetches DDLs, calls LLM, outputs to generated/ + +make validate-generated # 3. (Optional) Tune generated/ files, validate after each edit +make apply # Validates → promotes → terraform apply +``` + +That's it. `make apply` creates groups, tags, masking functions, FGAC policies, UC grants, and a Genie Space (with AI-generated sample questions, instructions, benchmarks, SQL filters/measures/expressions, and join specs) — all in one command. + +To tear everything down: `make destroy`. + +## Configuration + +Three files, clear separation of concerns: + + +| File | What goes here | Tracked in git? 
| +| ------------------ | ------------------------------------------------------------------------ | --------------- | +| `auth.auto.tfvars` | Credentials only (account ID, client ID/secret, workspace) | No (secrets) | +| `env.auto.tfvars` | `uc_tables`, `sql_warehouse_id`, `genie_space_id` | **Yes** | +| `abac.auto.tfvars` | Groups, tag policies, tag assignments, FGAC policies, Genie Space config | **Yes** | + + +### `auth.auto.tfvars` — credentials (gitignored) + +```hcl +databricks_account_id = "..." +databricks_client_id = "..." +databricks_client_secret = "..." +databricks_workspace_id = "..." +databricks_workspace_host = "https://..." +``` + +### `env.auto.tfvars` — environment config (checked in) + +```hcl +uc_tables = ["catalog.schema.table1", "catalog.schema.*"] # tables for ABAC + Genie +sql_warehouse_id = "" # set to reuse existing, or leave empty to auto-create +genie_space_id = "" # set for existing space, or leave empty to auto-create +``` + +### `abac.auto.tfvars` — ABAC + Genie config (auto-generated) + +Generated by `make generate`. Contains groups, tag policies, tag assignments, FGAC policies, and Genie Space config (title, description, sample questions, instructions, benchmarks). Tune it before applying. See `generated/TUNING.md` for guidance. 
+ +## Genie Space + +Managed automatically based on `genie_space_id` in `env.auto.tfvars`: + + +| `genie_space_id` | `uc_tables` | What happens on `make apply` | +| ---------------- | ----------- | ----------------------------------------------------------------------------------------- | +| Empty | Non-empty | Auto-creates a Genie Space from `uc_tables`, sets CAN_RUN ACLs, trashes on `make destroy` | +| Set | Any | Applies CAN_RUN ACLs to the existing space | +| Empty | Empty | No Genie Space action | + + +When `make generate` creates the ABAC config, it also generates Genie Space config in `abac.auto.tfvars`: + + +| Variable | Purpose | +| ------------------------- | -------------------------------------------------------------------------------------------------------- | +| `genie_space_title` | AI-generated title for the Genie Space (e.g., "Financial Compliance Analytics") | +| `genie_space_description` | 1–2 sentence summary of the space's scope and audience | +| `genie_sample_questions` | Natural-language questions shown as conversation starters in the Genie UI | +| `genie_instructions` | Domain-specific guidance including business defaults (e.g., "customer" = active by default) | +| `genie_benchmarks` | Unambiguous ground-truth question + SQL pairs for evaluating Genie accuracy | +| `genie_sql_filters` | Default WHERE clauses (e.g., active customers, completed transactions) that guide Genie's SQL generation | +| `genie_sql_measures` | Standard aggregate metrics (e.g., total revenue, average risk score) | +| `genie_sql_expressions` | Computed dimensions (e.g., transaction year, age bucket) | +| `genie_join_specs` | Table relationships with join conditions (e.g., accounts to customers on CustomerID) | + + +All nine fields are included in the `serialized_space` when a new Genie Space is created. Review and tune them in `generated/abac.auto.tfvars` alongside the ABAC policies before applying. 
+
+## Make Targets
+
+
+| Target | Description |
+| ------------------------- | ---------------------------------------------------------------- |
+| `make setup` | Copy example files, create `ddl/` and `generated/` directories |
+| `make generate` | Run `generate_abac.py` to produce masking SQL + tfvars |
+| `make validate-generated` | Validate `generated/` files (run after each tuning edit) |
+| `make validate` | Validate root `abac.auto.tfvars` + `masking_functions.sql` |
+| `make promote` | Validate `generated/` and copy to module root |
+| `make plan` | `terraform init` + `terraform plan` |
+| `make apply` | Validate, promote, then `terraform apply` |
+| `make destroy` | `terraform destroy` (cleans up everything including Genie Space) |
+| `make clean` | Remove generated files, Terraform state, and `.terraform/` |
+
+
+## Importing Existing Resources
+
+If groups, tag policies, or FGAC policies already exist in Databricks, `terraform apply` will fail with "already exists". Import them first:
+
+```bash
+./scripts/import_existing.sh # import all resource types
+./scripts/import_existing.sh --dry-run # preview without importing
+./scripts/import_existing.sh --groups-only # import only groups
+./scripts/import_existing.sh --tags-only # import only tag policies
+./scripts/import_existing.sh --fgac-only # import only FGAC policies
+```
+
+See [IMPORT_EXISTING.md](IMPORT_EXISTING.md) for details.
+
+## Troubleshooting
+
+### "Provider produced inconsistent result after apply" (tag policies)
+
+A known Databricks provider bug — the API reorders tag policy values after creation, causing a state mismatch. **Your tag policies are created correctly**; only the Terraform state comparison fails. 
+ +`make apply` prevents this entirely via three mechanisms: `make sync-tags` updates values directly through the Databricks SDK (bypassing Terraform), all tag policies are reimported before apply to sync state with the API's ordering, and `ignore_changes = [values]` in `tag_policies.tf` prevents Terraform from attempting value reordering. You should not see this error when using `make apply`. + +If you run `terraform apply` directly (bypassing the Makefile) and hit this error, use `make apply` instead. If you need to recover manually: + +```bash +# Remove and reimport all tag policies to sync state +python3 -c "import hcl2,sys; d=hcl2.load(open('abac.auto.tfvars')); [print(tp['key']) for tp in d.get('tag_policies',[])]" | \ + while read key; do + terraform state rm "databricks_tag_policy.policies[\"$key\"]" 2>/dev/null || true + terraform import "databricks_tag_policy.policies[\"$key\"]" "$key" + done +terraform apply -parallelism=1 -auto-approve +``` + +### "already exists" + +Resources (groups, tag policies) already exist in Databricks. Import them so Terraform can manage them: + +```bash +./scripts/import_existing.sh +``` + +## Advanced Usage + +### Generation options + +```bash +python generate_abac.py --tables "a.b.*" "c.d.e" # override uc_tables +python generate_abac.py --dry-run # preview prompt without calling LLM +``` + +### Examples + +A pre-built finance demo is available in `examples/finance/` — copy the tfvars and SQL files to try without AI generation. Sample healthcare DDLs are in `examples/healthcare/ddl/` for testing `make generate`. 
+ +## Roadmap + +- Unity Catalog metrics in Genie +- Multi Genie Space support +- Multi data steward / user support +- AI-assisted tuning and troubleshooting +- Auto-detect and import existing policies +- Import existing groups + diff --git a/uc-quickstart/utils/genie/aws/abac.auto.tfvars.example b/uc-quickstart/utils/genie/aws/abac.auto.tfvars.example new file mode 100644 index 00000000..a04c359a --- /dev/null +++ b/uc-quickstart/utils/genie/aws/abac.auto.tfvars.example @@ -0,0 +1,144 @@ +# ============================================================================ +# ABAC Terraform Module — Variable Skeleton (ABAC + Genie config) +# ============================================================================ +# This file contains the ABAC configuration (groups, tags, policies) +# and Genie Space config (title, description, questions, instructions, +# benchmarks). Credentials go in auth.auto.tfvars; environment in env.auto.tfvars. +# +# Setup: +# 1. cp auth.auto.tfvars.example auth.auto.tfvars (credentials — gitignored) +# cp env.auto.tfvars.example env.auto.tfvars (tables + environment) +# 2. cp abac.auto.tfvars.example abac.auto.tfvars (fill in ABAC config) +# 3. terraform apply (loads both files automatically) +# +# For a complete working example see examples/finance/finance.tfvars.example. +# ============================================================================ + +# === Groups: one entry per access tier === +# Each key becomes a Databricks account-level group with consumer entitlements. +groups = { + # "GroupName" = { description = "What this group can see" } +} + +# === Tag policies: governance tags for ABAC matching === +# Each entry creates a databricks_tag_policy with the specified allowed values. 
+tag_policies = [ + # { key = "sensitivity", description = "Data sensitivity level", values = ["public", "confidential", "restricted"] } +] + +# === Tag assignments: bind tags to your tables/columns === +# entity_type: "tables" or "columns" +# entity_name must be fully qualified: +# Tables: "catalog.schema.TableName" +# Columns: "catalog.schema.TableName.ColumnName" +tag_assignments = [ + # { entity_type = "columns", entity_name = "my_catalog.my_schema.Table.Column", tag_key = "sensitivity", tag_value = "confidential" } +] + +# === FGAC policies: the access rules === +# policy_type: POLICY_TYPE_COLUMN_MASK or POLICY_TYPE_ROW_FILTER +# catalog, function_catalog, function_schema are REQUIRED on each policy. +# function_name is relative (just the function name, e.g. "mask_redact"). +fgac_policies = [ + # Column mask example: + # { + # name = "mask_confidential" + # policy_type = "POLICY_TYPE_COLUMN_MASK" + # catalog = "my_catalog" + # to_principals = ["Restricted_Users"] + # comment = "Mask confidential columns" + # match_condition = "hasTagValue('sensitivity', 'confidential')" + # match_alias = "cols" + # function_name = "mask_redact" + # function_catalog = "my_catalog" + # function_schema = "my_schema" + # } + # + # Row filter example: + # { + # name = "region_filter" + # policy_type = "POLICY_TYPE_ROW_FILTER" + # catalog = "my_catalog" + # to_principals = ["EU_Staff"] + # comment = "EU staff see EU data only" + # when_condition = "hasTagValue('data_region', 'scoped')" + # function_name = "filter_by_region_eu" + # function_catalog = "my_catalog" + # function_schema = "my_schema" + # } +] + +# === Group members (optional): account-level user IDs to add to groups === +# Get IDs from Account Console > Users or SCIM API. +group_members = { + # "GroupName" = ["user_id_1", "user_id_2"] +} + +# === Genie Space Config (AI-generated — tune as needed) === +# Title and description for the auto-created Genie Space. 
+# genie_space_title = "Financial & Clinical Analytics" +# genie_space_description = "Explore transaction data, patient encounters, and compliance metrics with natural language." + +# Sample questions shown as conversation starters in the Genie Space UI. +# genie_sample_questions = [ +# "What is the total revenue by region for last quarter?", +# "Show the top 10 customers by transaction volume", +# "Which accounts have been flagged for AML review?", +# ] + +# Domain-specific guidance for the Genie LLM (include business defaults). +# genie_instructions = "When asked about 'customers' without a status qualifier, default to active customers (CustomerStatus = 'Active'). 'Last month' means the previous calendar month. Round monetary values to 2 decimal places." + +# Ground-truth SQL for evaluating Genie accuracy. +# Each question must be unambiguous — include explicit qualifiers so question and SQL agree on scope. +# genie_benchmarks = [ +# { +# question = "What is the total amount of completed transactions?" +# sql = "SELECT SUM(Amount) as total_amount FROM catalog.schema.transactions WHERE TransactionStatus = 'Completed'" +# }, +# ] + +# Default WHERE clauses that guide Genie's SQL generation. +# genie_sql_filters = [ +# { +# sql = "customers.CustomerStatus = 'Active'" +# display_name = "active customers" +# comment = "Only include customers with Active status" +# instruction = "Apply when the user asks about customers without specifying a status" +# }, +# ] + +# Standard aggregate metrics for Genie to use. +# genie_sql_measures = [ +# { +# alias = "total_revenue" +# sql = "SUM(transactions.Amount)" +# display_name = "total revenue" +# comment = "Sum of all transaction amounts" +# instruction = "Use for revenue, total amount, or sales calculations" +# }, +# ] + +# Computed dimensions/columns for Genie to use. 
+# genie_sql_expressions = [ +# { +# alias = "transaction_year" +# sql = "YEAR(transactions.TransactionDate)" +# display_name = "transaction year" +# comment = "Extracts year from transaction date" +# instruction = "Use for year-over-year analysis of transactions" +# }, +# ] + +# Join relationships between tables. +# genie_join_specs = [ +# { +# left_table = "catalog.schema.accounts" +# left_alias = "accounts" +# right_table = "catalog.schema.customers" +# right_alias = "customers" +# sql = "accounts.CustomerID = customers.CustomerID" +# comment = "Join accounts to customers on CustomerID" +# instruction = "Use when you need customer details for account queries" +# }, +# ] diff --git a/uc-quickstart/utils/genie/aws/auth.auto.tfvars.example b/uc-quickstart/utils/genie/aws/auth.auto.tfvars.example new file mode 100644 index 00000000..9851969c --- /dev/null +++ b/uc-quickstart/utils/genie/aws/auth.auto.tfvars.example @@ -0,0 +1,11 @@ +# Databricks Authentication — secrets only. +# Terraform auto-loads *.auto.tfvars — no need to pass -var-file. +# This file is gitignored. NEVER check it in. +# +# cp auth.auto.tfvars.example auth.auto.tfvars + +databricks_account_id = "" +databricks_client_id = "" +databricks_client_secret = "" +databricks_workspace_id = "" +databricks_workspace_host = "" diff --git a/uc-quickstart/utils/genie/aws/ddl/README.md b/uc-quickstart/utils/genie/aws/ddl/README.md new file mode 100644 index 00000000..fb35fb75 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/ddl/README.md @@ -0,0 +1,14 @@ +# DDL Input Folder + +Place your `CREATE TABLE` DDL files here (`.sql`). The `generate_abac.py` script reads all `.sql` files from this folder. 
+ +**Supports:** +- A single file with multiple `CREATE TABLE` statements +- One file per table (recommended for clarity) + +**Example — using the healthcare sample DDLs:** + +```bash +cp examples/healthcare/ddl/*.sql ddl/ +python generate_abac.py --catalog my_catalog --schema my_schema +``` diff --git a/uc-quickstart/utils/genie/aws/deploy_masking_functions.py b/uc-quickstart/utils/genie/aws/deploy_masking_functions.py new file mode 100644 index 00000000..fc7470c2 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/deploy_masking_functions.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +"""Deploy or drop masking functions via Databricks Statement Execution API. + +Called by Terraform (null_resource + local-exec) during apply and destroy. +Auth is read from environment variables set by the provisioner: + DATABRICKS_HOST, DATABRICKS_CLIENT_ID, DATABRICKS_CLIENT_SECRET + +Usage: + python3 deploy_masking_functions.py \ + --sql-file masking_functions.sql --warehouse-id + python3 deploy_masking_functions.py \ + --sql-file masking_functions.sql --warehouse-id --drop +""" + +import argparse +import re +import subprocess +import sys + +PRODUCT_NAME = "genierails" +PRODUCT_VERSION = "0.1.0" + +REQUIRED_PACKAGES = {"databricks-sdk": "databricks.sdk"} + + +def _ensure_packages(): + missing = [] + for pip_name, import_name in REQUIRED_PACKAGES.items(): + try: + __import__(import_name) + except ImportError: + missing.append(pip_name) + if missing: + print(f" Installing missing packages: {', '.join(missing)}...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "--quiet", *missing], + ) + try: + __import__("databricks.sdk.useragent") + except (ImportError, ModuleNotFoundError): + print(" Upgrading databricks-sdk (need databricks.sdk.useragent)...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "--quiet", "--upgrade", "databricks-sdk"], + ) + + +_ensure_packages() + +from databricks.sdk import WorkspaceClient # noqa: E402 +from 
databricks.sdk.service.sql import ( # noqa: E402 + StatementState, +) + + +def parse_sql_blocks(sql_text: str) -> list: + """Parse a SQL file into (catalog, schema, statement) tuples. + + Tracks USE CATALOG / USE SCHEMA directives to determine the execution + context for each CREATE statement. + """ + catalog, schema = None, None + blocks = [] + + for raw_stmt in re.split(r";\s*(?:--[^\n]*)?\n", sql_text): + lines = [l for l in raw_stmt.split("\n") + if l.strip() and not l.strip().startswith("--")] + stmt = "\n".join(lines).strip() + if not stmt: + continue + + m = re.match(r"USE\s+CATALOG\s+(\S+)", stmt, re.IGNORECASE) + if m: + catalog = m.group(1) + continue + + m = re.match(r"USE\s+SCHEMA\s+(\S+)", stmt, re.IGNORECASE) + if m: + schema = m.group(1) + continue + + if stmt.upper().startswith("CREATE"): + blocks.append((catalog, schema, stmt)) + + return blocks + + +def extract_function_name(stmt: str) -> str: + """Extract function name from a CREATE FUNCTION statement.""" + m = re.search( + r"FUNCTION\s+(\S+)\s*\(", stmt, re.IGNORECASE + ) + return m.group(1) if m else "" + + +def deploy(sql_file: str, warehouse_id: str) -> None: + w = WorkspaceClient(product=PRODUCT_NAME, product_version=PRODUCT_VERSION) + + with open(sql_file) as f: + sql_text = f.read() + + blocks = parse_sql_blocks(sql_text) + if not blocks: + print(" No CREATE statements found in SQL file — nothing to deploy.") + return + + total = len(blocks) + print(f" Deploying {total} function(s) via Statement Execution API...") + + failed = 0 + for i, (catalog, schema, stmt) in enumerate(blocks, 1): + func_name = extract_function_name(stmt) + target = f"{catalog}.{schema}" if catalog and schema else "" + print(f" [{i}/{total}] {target}.{func_name} ...", end=" ", flush=True) + + try: + resp = w.statement_execution.execute_statement( + warehouse_id=warehouse_id, + statement=stmt, + catalog=catalog, + schema=schema, + wait_timeout="30s", + ) + except Exception as e: + print(f"ERROR: {e}") + failed += 1 + 
continue + + state = resp.status.state + if state == StatementState.SUCCEEDED: + print("OK") + else: + error_msg = "" + if resp.status.error: + error_msg = resp.status.error.message or str(resp.status.error) + print(f"FAILED ({state.value}): {error_msg}") + failed += 1 + + print() + if failed: + print(f" {failed}/{total} statement(s) failed.") + sys.exit(1) + else: + print(f" All {total} function(s) deployed successfully.") + + +def drop(sql_file: str, warehouse_id: str) -> None: + w = WorkspaceClient(product=PRODUCT_NAME, product_version=PRODUCT_VERSION) + + with open(sql_file) as f: + sql_text = f.read() + + blocks = parse_sql_blocks(sql_text) + if not blocks: + print(" No functions found in SQL file — nothing to drop.") + return + + total = len(blocks) + print(f" Dropping {total} function(s) via Statement Execution API...") + + failed = 0 + for i, (catalog, schema, stmt) in enumerate(blocks, 1): + func_name = extract_function_name(stmt) + fqn = f"{catalog}.{schema}.{func_name}" if catalog and schema else func_name + target = f"{catalog}.{schema}" if catalog and schema else "" + print(f" [{i}/{total}] DROP {target}.{func_name} ...", end=" ", flush=True) + + drop_stmt = f"DROP FUNCTION IF EXISTS {fqn}" + try: + resp = w.statement_execution.execute_statement( + warehouse_id=warehouse_id, + statement=drop_stmt, + catalog=catalog, + schema=schema, + wait_timeout="30s", + ) + except Exception as e: + print(f"ERROR: {e}") + failed += 1 + continue + + state = resp.status.state + if state == StatementState.SUCCEEDED: + print("OK") + else: + error_msg = "" + if resp.status.error: + error_msg = resp.status.error.message or str(resp.status.error) + print(f"FAILED ({state.value}): {error_msg}") + failed += 1 + + print() + if failed: + print(f" {failed}/{total} drop(s) failed.") + sys.exit(1) + else: + print(f" All {total} function(s) dropped successfully.") + + +def main(): + parser = argparse.ArgumentParser( + description="Deploy or drop masking functions via " + 
"Databricks Statement Execution API" + ) + parser.add_argument( + "--sql-file", + required=True, + help="Path to masking_functions.sql", + ) + parser.add_argument( + "--warehouse-id", + required=True, + help="SQL warehouse ID for statement execution", + ) + parser.add_argument( + "--drop", + action="store_true", + help="Drop functions instead of creating them (used during terraform destroy)", + ) + args = parser.parse_args() + + if args.drop: + drop(args.sql_file, args.warehouse_id) + else: + deploy(args.sql_file, args.warehouse_id) + + +if __name__ == "__main__": + main() diff --git a/uc-quickstart/utils/genie/aws/entity_tag_assignments.tf b/uc-quickstart/utils/genie/aws/entity_tag_assignments.tf new file mode 100644 index 00000000..134049a6 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/entity_tag_assignments.tf @@ -0,0 +1,26 @@ +# ============================================================================ +# Entity Tag Assignments (data-driven) +# ============================================================================ +# Applies governed tags to tables and columns from var.tag_assignments. +# entity_name must be fully qualified (catalog.schema.table for tables, +# catalog.schema.table.column for columns). 
+# ============================================================================ + +locals { + tag_assignment_map = { + for ta in var.tag_assignments : + "${ta.entity_type}|${ta.entity_name}|${ta.tag_key}|${ta.tag_value}" => ta + } +} + +resource "databricks_entity_tag_assignment" "assignments" { + for_each = local.tag_assignment_map + + provider = databricks.workspace + entity_type = each.value.entity_type + entity_name = each.value.entity_name + tag_key = each.value.tag_key + tag_value = each.value.tag_value + + depends_on = [databricks_tag_policy.policies] +} diff --git a/uc-quickstart/utils/genie/aws/env.auto.tfvars.example b/uc-quickstart/utils/genie/aws/env.auto.tfvars.example new file mode 100644 index 00000000..f2f2d33e --- /dev/null +++ b/uc-quickstart/utils/genie/aws/env.auto.tfvars.example @@ -0,0 +1,22 @@ +# Environment Config — tables, warehouse, and Genie Space settings. +# Terraform auto-loads *.auto.tfvars — no need to pass -var-file. +# This file is safe to check into Git (no secrets). +# +# cp env.auto.tfvars.example env.auto.tfvars + +# Tables to generate ABAC policies for (fully qualified: catalog.schema.table). +# Use catalog.schema.* to include all tables in a schema. +# Example: +# uc_tables = ["prod.sales.customers", "prod.sales.orders", "dev.finance.*"] +uc_tables = [] + +# SQL warehouse ID (shared by masking function deployment + Genie Space). +# Set to reuse an existing warehouse (dev). Leave empty to auto-create a +# serverless warehouse (prod/greenfield). +# Find warehouse IDs: Databricks workspace > SQL Warehouses > select warehouse > copy ID +sql_warehouse_id = "" + +# Genie Space ID. Set to apply ACLs to an existing space. +# Leave empty to auto-create a new Genie Space from uc_tables on apply. +# Find space ID: open the Genie Space in Databricks UI > copy ID from the URL. 
+genie_space_id = ""
diff --git a/uc-quickstart/utils/genie/aws/examples/finance/0.1finance_abac_functions.sql b/uc-quickstart/utils/genie/aws/examples/finance/0.1finance_abac_functions.sql
new file mode 100644
index 00000000..4af9c48c
--- /dev/null
+++ b/uc-quickstart/utils/genie/aws/examples/finance/0.1finance_abac_functions.sql
@@ -0,0 +1,260 @@
+-- =============================================
+-- DATABRICKS UNITY CATALOG ABAC MASKING FUNCTIONS - FINANCE DOMAIN
+-- Purpose: Attribute-Based Access Control (ABAC) utility functions for financial services data masking
+-- Compliance: PCI-DSS, AML/KYC, GDPR, SOX, GLBA
+-- Reference: https://docs.databricks.com/aws/en/data-governance/unity-catalog/abac/
+-- =============================================
+
+-- Set catalog and schema context
+USE CATALOG fincat;
+USE SCHEMA finance;
+
+-- =============================================
+-- MASKING FUNCTIONS (8 total)
+-- These transform/hide data values while preserving table structure
+-- =============================================
+
+-- =============================================
+-- 1. CREDIT CARD FULL MASKING FUNCTION
+-- Purpose: Complete masking of credit card numbers for PCI-DSS compliance
+-- Usage: Customer service representatives with basic clearance
+-- Input: Credit card number (e.g., 4532-1234-5678-9010)
+-- Output: Fully masked (XXXX-XXXX-XXXX-XXXX)
+-- =============================================
+CREATE OR REPLACE FUNCTION mask_credit_card_full(card_number STRING)
+RETURNS STRING
+COMMENT 'ABAC utility: Full credit card masking for PCI-DSS compliance'
+RETURN CASE
+    WHEN card_number IS NULL OR card_number = '' THEN card_number
+    ELSE 'XXXX-XXXX-XXXX-XXXX'
+END;
+
+-- =============================================
+-- 2. 
CREDIT CARD LAST 4 DIGITS FUNCTION +-- Purpose: Show only last 4 digits for customer service verification +-- Usage: Customer service and fraud detection teams +-- Input: Credit card number (e.g., 4532-1234-5678-9010) +-- Output: Masked with last 4 visible (XXXX-XXXX-XXXX-9010) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_credit_card_last4(card_number STRING) +RETURNS STRING +COMMENT 'ABAC utility: Show last 4 digits of credit card for verification' +RETURN CASE + WHEN card_number IS NULL OR card_number = '' THEN card_number + WHEN LENGTH(REGEXP_REPLACE(card_number, '[^0-9]', '')) >= 4 THEN + CONCAT('XXXX-XXXX-XXXX-', RIGHT(REGEXP_REPLACE(card_number, '[^0-9]', ''), 4)) + ELSE 'XXXX-XXXX-XXXX-XXXX' +END; + +-- ============================================= +-- 3. SSN MASKING FUNCTION +-- Purpose: Mask Social Security Numbers while showing last 4 for verification +-- Usage: Customer service and compliance teams +-- Input: SSN (e.g., 123-45-6789) +-- Output: Masked SSN (XXX-XX-6789) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_ssn(ssn STRING) +RETURNS STRING +COMMENT 'ABAC utility: Mask SSN showing only last 4 digits for GLBA compliance' +RETURN CASE + WHEN ssn IS NULL OR ssn = '' THEN ssn + WHEN LENGTH(REGEXP_REPLACE(ssn, '[^0-9]', '')) = 9 THEN + CONCAT('XXX-XX-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) + ELSE 'XXX-XX-XXXX' +END; + +-- ============================================= +-- 4. ACCOUNT NUMBER TOKENIZATION FUNCTION +-- Purpose: Deterministic masking of account numbers for analytics +-- Usage: Data analysts and reporting teams +-- Input: Account number (e.g., ACC123456) +-- Output: Deterministic token (e.g., ACCT_a3f9c2...) 
+-- ============================================= +CREATE OR REPLACE FUNCTION mask_account_number(account_id STRING) +RETURNS STRING +COMMENT 'ABAC utility: Deterministic account number tokenization for cross-table analytics' +RETURN CASE + WHEN account_id IS NULL OR account_id = '' THEN account_id + ELSE CONCAT('ACCT_', LEFT(SHA2(account_id, 256), 12)) +END; + +-- ============================================= +-- 5. EMAIL MASKING FOR FINANCE FUNCTION +-- Purpose: Mask customer email addresses for privacy +-- Usage: Marketing and customer service teams +-- Input: Email (e.g., john.doe@example.com) +-- Output: Masked email (****@example.com) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_email_finance(email STRING) +RETURNS STRING +COMMENT 'ABAC utility: Mask email local part while preserving domain for GDPR compliance' +RETURN CASE + WHEN email IS NULL OR email = '' THEN email + WHEN LOCATE('@', email) > 0 THEN + CONCAT('****', SUBSTRING(email, LOCATE('@', email))) + ELSE '****' +END; + +-- ============================================= +-- 6. CUSTOMER ID DETERMINISTIC MASKING FUNCTION +-- Purpose: Hash customer IDs for referential integrity in analytics +-- Usage: Data scientists and analysts performing cross-table joins +-- Input: Customer ID (e.g., CUST00123) +-- Output: Deterministic reference (e.g., REF_c8a9f...) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_customer_id_deterministic(customer_id STRING) +RETURNS STRING +COMMENT 'ABAC utility: Deterministic customer ID masking preserving join capability' +RETURN CASE + WHEN customer_id IS NULL OR customer_id = '' THEN customer_id + ELSE CONCAT('REF_', LEFT(SHA2(customer_id, 256), 10)) +END; + +-- ============================================= +-- 7. 
TRANSACTION AMOUNT ROUNDING FUNCTION +-- Purpose: Round transaction amounts for aggregated reporting +-- Usage: Marketing teams and external partners +-- Input: Amount (e.g., 1234.56) +-- Output: Rounded amount (1200.00) +-- ============================================= +CREATE OR REPLACE FUNCTION mask_amount_rounded(amount DECIMAL(18,2)) +RETURNS DECIMAL(18,2) +COMMENT 'ABAC utility: Round amounts to nearest hundred for aggregated analytics' +RETURN CASE + WHEN amount IS NULL THEN NULL + WHEN amount < 100 THEN ROUND(amount, -1) -- Round to nearest 10 + ELSE ROUND(amount, -2) -- Round to nearest 100 +END; + +-- ============================================= +-- 8. PII STRING PARTIAL MASKING FUNCTION +-- Purpose: Show only first and last characters of PII fields +-- Usage: Customer names and addresses for partial visibility +-- Input: String value (e.g., "John") +-- Output: Partially masked string (e.g., "J**n") +-- ============================================= +CREATE OR REPLACE FUNCTION mask_pii_partial(input STRING) +RETURNS STRING +COMMENT 'ABAC utility: Partial PII masking showing first and last characters for GDPR' +RETURN CASE + WHEN input IS NULL OR input = '' THEN input + WHEN LENGTH(input) <= 2 THEN REPEAT('*', LENGTH(input)) + WHEN LENGTH(input) = 3 THEN CONCAT(LEFT(input, 1), '*', RIGHT(input, 1)) + ELSE CONCAT(LEFT(input, 1), REPEAT('*', LENGTH(input) - 2), RIGHT(input, 1)) +END; + +-- ============================================= +-- ROW FILTER FUNCTIONS (Zero-argument for Unity Catalog ABAC) +-- These control which rows are visible to users based on group membership +-- Note: UC ROW FILTER policies require 0-argument functions +-- ============================================= + +-- ============================================= +-- 9. 
TRADING HOURS TIME-BASED FILTER +-- Purpose: Restrict access to trading positions during market hours +-- Usage: Prevent risk managers from accessing live positions during trading +-- Input: None (uses current time) +-- Output: Boolean indicating if access is allowed (outside trading hours 9:30 AM - 4:00 PM ET) +-- ============================================= +CREATE OR REPLACE FUNCTION filter_trading_hours() +RETURNS BOOLEAN +COMMENT 'ABAC utility: Time-based access control for trading positions outside market hours' +RETURN + -- Allow access outside NYSE trading hours (9:30 AM - 4:00 PM ET) + -- Convert to UTC: 9:30 AM ET = 14:30 UTC, 4:00 PM ET = 21:00 UTC (EST) + -- Note: Adjust for daylight saving time in production + CASE + WHEN hour(current_timestamp()) < 14 OR hour(current_timestamp()) >= 21 THEN TRUE + ELSE FALSE + END; + +-- ============================================= +-- 10. INFORMATION BARRIER FILTER (Chinese Wall) +-- Purpose: Block research analysts from trading data +-- Usage: Enforce SEC/MiFID II Chinese wall for research analysts +-- Input: None (checks current user group membership) +-- Output: Boolean - FALSE blocks access for Research_Analyst group +-- ============================================= +CREATE OR REPLACE FUNCTION filter_information_barrier() +RETURNS BOOLEAN +COMMENT 'ABAC utility: Chinese wall - block research analysts from trading positions' +RETURN + -- Research analysts are blocked (return FALSE to deny access) + -- This function is applied only to tables tagged with information_barrier + -- Risk managers and compliance have Neutral access (not blocked) + TRUE; -- Default allow - policy applies this selectively via WHEN clause + +-- ============================================= +-- 11. 
AML CLEARANCE FILTER +-- Purpose: Hide flagged/high-risk transactions from junior analysts +-- Usage: Junior AML analysts cannot see flagged transactions +-- Input: None (checks current user group membership) +-- Output: Boolean - controls visibility of sensitive AML data +-- ============================================= +CREATE OR REPLACE FUNCTION filter_aml_clearance() +RETURNS BOOLEAN +COMMENT 'ABAC utility: Hide flagged transactions from junior AML analysts' +RETURN + -- Junior analysts blocked from flagged transactions + -- Senior investigators and compliance see all + TRUE; -- Default allow - policy WHEN clause controls application + +-- ============================================= +-- 12. REGIONAL DATA RESIDENCY FILTER - EU +-- Purpose: Show only EU customer data to EU staff +-- Usage: GDPR compliance - EU staff see EU data only +-- Input: None (checks current user group membership) +-- Output: Boolean indicating if row should be visible +-- ============================================= +CREATE OR REPLACE FUNCTION filter_by_region_eu() +RETURNS BOOLEAN +COMMENT 'ABAC utility: GDPR - EU regional staff see EU customer data only' +RETURN TRUE; -- Applied via WHEN clause to customer_region='EU' tables + +-- ============================================= +-- 13. REGIONAL DATA RESIDENCY FILTER - US +-- Purpose: Show only US customer data to US staff +-- Usage: CCPA/GLBA compliance - US staff see US data only +-- Input: None (checks current user group membership) +-- Output: Boolean indicating if row should be visible +-- ============================================= +CREATE OR REPLACE FUNCTION filter_by_region_us() +RETURNS BOOLEAN +COMMENT 'ABAC utility: CCPA/GLBA - US regional staff see US customer data only' +RETURN TRUE; -- Applied via WHEN clause to customer_region='US' tables + +-- ============================================= +-- 14. 
REGIONAL DATA RESIDENCY FILTER - APAC
-- Purpose: Show only APAC customer data to APAC staff
-- Usage: PDPA compliance - APAC staff see APAC data only
-- Input: None (checks current user group membership)
-- Output: Boolean indicating if row should be visible
-- =============================================
CREATE OR REPLACE FUNCTION filter_by_region_apac()
RETURNS BOOLEAN
COMMENT 'ABAC utility: PDPA - APAC regional staff see APAC customer data only'
RETURN TRUE; -- Applied via WHEN clause to customer_region='APAC' tables

-- =============================================
-- 15. TEMPORARY AUDITOR ACCESS FILTER
-- Purpose: Grant access to external auditors (always allow within policy scope)
-- Usage: SOX compliance - external auditors with temporary access
-- Input: None (group membership determines access)
-- Output: Boolean indicating if access is allowed
-- =============================================
CREATE OR REPLACE FUNCTION filter_audit_expiry()
RETURNS BOOLEAN
COMMENT 'ABAC utility: Temporary access control for external auditors (SOX compliance)'
RETURN TRUE; -- Applied via WHEN clause with audit_project tag

-- =============================================
-- VERIFICATION AND TESTING
-- =============================================

-- Confirm every masking and row-filter function was registered in the schema
SHOW FUNCTIONS IN finance LIKE 'mask*';
SHOW FUNCTIONS IN finance LIKE 'filter*';

SELECT 'āœ… Successfully created 15 finance ABAC functions (8 masking, 7 row filters)' as status;
SELECT 'šŸ“‹ Row filter functions are zero-argument for Unity Catalog ABAC policies' as note;
SELECT 'šŸ” Functions ready for: PCI-DSS, AML/KYC, GDPR, SOX, GLBA compliance' as compliance_frameworks;
diff --git a/uc-quickstart/utils/genie/aws/examples/finance/0.2finance_database_schema.sql b/uc-quickstart/utils/genie/aws/examples/finance/0.2finance_database_schema.sql new file mode 100644 index 00000000..0b7eaa44 --- /dev/null +++ 
b/uc-quickstart/utils/genie/aws/examples/finance/0.2finance_database_schema.sql @@ -0,0 +1,403 @@
-- =============================================
-- DATABRICKS UNITY CATALOG - FINANCE DOMAIN DATABASE SCHEMA
-- Purpose: Create comprehensive financial services database for ABAC demonstrations
-- Compliance: PCI-DSS, AML/KYC, GDPR, SOX, GLBA
-- Tables: Customers, Accounts, Transactions, CreditCards, TradingPositions, AMLAlerts, AuditLogs
-- =============================================

USE CATALOG fincat;

USE SCHEMA finance;

-- =============================================
-- TABLE 1: CUSTOMERS
-- Purpose: Core customer master data with PII
-- Compliance: GDPR, GLBA, CCPA
-- =============================================
DROP TABLE IF EXISTS Customers;

CREATE TABLE Customers (
    CustomerID STRING NOT NULL,
    FirstName STRING,
    LastName STRING,
    Email STRING,
    SSN STRING COMMENT 'Social Security Number - PII/Sensitive',
    DateOfBirth DATE,
    Address STRING,
    City STRING,
    State STRING,
    ZipCode STRING,
    CustomerRegion STRING COMMENT 'Data residency region: EU, US, APAC, LATAM',
    AccountOpenDate DATE,
    CustomerStatus STRING COMMENT 'Active, Suspended, Closed',
    RiskScore INT COMMENT 'AML risk score 1-100',
    KYCVerificationDate DATE,
    CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
COMMENT 'Customer master data with PII for GDPR/GLBA compliance demonstrations'
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');

-- Seed ten customers spanning the US, EU, APAC, and LATAM residency regions
INSERT INTO Customers VALUES
    ('CUST00001', 'John', 'Smith', 'john.smith@email.com', '123-45-6789', '1975-03-15', '123 Main St', 'New York', 'NY', '10001', 'US', '2020-01-15', 'Active', 25, '2020-01-10', CURRENT_TIMESTAMP()),
    ('CUST00002', 'Maria', 'Garcia', 'maria.garcia@email.com', '234-56-7890', '1982-07-22', '456 Oak Ave', 'Los Angeles', 'CA', '90001', 'US', '2019-05-20', 'Active', 15, '2019-05-15', CURRENT_TIMESTAMP()),
    ('CUST00003', 'Hans', 'Mueller', 'hans.mueller@email.de', '345-67-8901', '1990-11-08', 'Berliner Str 78', 'Berlin', 'BE', '10115', 'EU', '2021-03-10', 'Active', 10, '2021-03-05', CURRENT_TIMESTAMP()),
    ('CUST00004', 'Sophie', 'Dubois', 'sophie.dubois@email.fr', '456-78-9012', '1988-02-14', '12 Rue de Paris', 'Paris', 'IDF', '75001', 'EU', '2020-08-25', 'Active', 20, '2020-08-20', CURRENT_TIMESTAMP()),
    ('CUST00005', 'Wei', 'Chen', 'wei.chen@email.cn', '567-89-0123', '1985-09-30', '88 Nanjing Rd', 'Shanghai', 'SH', '200001', 'APAC', '2021-11-12', 'Active', 30, '2021-11-10', CURRENT_TIMESTAMP()),
    ('CUST00006', 'Sarah', 'Johnson', 'sarah.j@email.com', '678-90-1234', '1992-05-18', '789 Pine St', 'Chicago', 'IL', '60601', 'US', '2022-02-14', 'Active', 12, '2022-02-10', CURRENT_TIMESTAMP()),
    ('CUST00007', 'Carlos', 'Silva', 'carlos.silva@email.br', '789-01-2345', '1978-12-03', 'Av Paulista 1000', 'Sao Paulo', 'SP', '01310', 'LATAM', '2019-09-08', 'Active', 45, '2019-09-05', CURRENT_TIMESTAMP()),
    ('CUST00008', 'Yuki', 'Tanaka', 'yuki.tanaka@email.jp', '890-12-3456', '1995-06-25', '1-1-1 Shibuya', 'Tokyo', 'TK', '150-0001', 'APAC', '2022-07-19', 'Active', 8, '2022-07-15', CURRENT_TIMESTAMP()),
    ('CUST00009', 'Emma', 'Wilson', 'emma.wilson@email.co.uk', '901-23-4567', '1987-04-12', '10 Downing St', 'London', 'LDN', 'SW1A', 'EU', '2020-12-05', 'Suspended', 75, '2020-12-01', CURRENT_TIMESTAMP()),
    ('CUST00010', 'Ahmed', 'Al-Saud', 'ahmed.alsaud@email.sa', '012-34-5678', '1983-08-20', 'King Fahd Rd', 'Riyadh', 'RY', '11564', 'APAC', '2021-06-30', 'Active', 55, '2021-06-25', CURRENT_TIMESTAMP());

-- =============================================
-- TABLE 2: ACCOUNTS
-- Purpose: Bank accounts linked to customers
-- Compliance: GLBA, regional banking regulations
-- =============================================
DROP TABLE IF EXISTS Accounts;

CREATE TABLE Accounts (
    AccountID STRING NOT NULL,
    CustomerID STRING NOT NULL,
    AccountType STRING COMMENT 'Checking, Savings, Investment, Credit',
    Balance DECIMAL(18,2),
    Currency STRING DEFAULT 'USD',
    OpenDate DATE,
    AccountStatus STRING COMMENT 'Active, Frozen, Closed',
    AccountRegion STRING COMMENT 'Region where account is held',
    InterestRate DECIMAL(5,4),
    LastTransactionDate DATE,
    CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
COMMENT 'Bank account information for balance and transaction tracking'
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');

-- Seed accounts across multiple currencies; ACC1010 is frozen for the demo scenario
INSERT INTO Accounts VALUES
    ('ACC1001', 'CUST00001', 'Checking', 15234.50, 'USD', '2020-01-15', 'Active', 'US', 0.0125, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1002', 'CUST00001', 'Savings', 45678.90, 'USD', '2020-01-15', 'Active', 'US', 0.0350, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1003', 'CUST00002', 'Checking', 8945.75, 'USD', '2019-05-20', 'Active', 'US', 0.0125, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1004', 'CUST00003', 'Checking', 12456.30, 'EUR', '2021-03-10', 'Active', 'EU', 0.0100, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1005', 'CUST00003', 'Investment', 78900.00, 'EUR', '2021-06-15', 'Active', 'EU', 0.0000, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1006', 'CUST00004', 'Savings', 23567.85, 'EUR', '2020-08-25', 'Active', 'EU', 0.0300, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1007', 'CUST00005', 'Checking', 34567.20, 'CNY', '2021-11-12', 'Active', 'APAC', 0.0200, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1008', 'CUST00006', 'Checking', 5678.40, 'USD', '2022-02-14', 'Active', 'US', 0.0125, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1009', 'CUST00007', 'Savings', 67890.50, 'BRL', '2019-09-08', 'Active', 'LATAM', 0.0650, '2026-02-08', CURRENT_TIMESTAMP()),
    ('ACC1010', 'CUST00009', 'Checking', 2345.60, 'GBP', '2020-12-05', 'Frozen', 'EU', 0.0150, '2026-02-08', CURRENT_TIMESTAMP());

-- =============================================
-- TABLE 3: TRANSACTIONS (RECREATED FOR FRAUD AI DEMO)
-- Purpose: Transaction history for AML monitoring + AI reasoning
-- Compliance: AML/KYC, FATF, FinCEN
-- 
=============================================

DROP TABLE IF EXISTS Transactions;

CREATE TABLE Transactions (
    TransactionID STRING NOT NULL,
    AccountID STRING NOT NULL,
    TransactionDate TIMESTAMP,
    Amount DECIMAL(18,2),
    Currency STRING DEFAULT 'USD',
    TransactionType STRING COMMENT 'Deposit, Withdrawal, Transfer, Payment',
    CountryCode STRING COMMENT 'Country where transaction originated',
    MerchantName STRING,
    TransactionStatus STRING COMMENT 'Completed, Pending, Flagged, Blocked',
    AMLFlagReason STRING COMMENT 'Large transaction, Cross-border, Suspicious pattern',

    -- Added for AI-driven fraud explanation
    IsInternational BOOLEAN COMMENT 'TRUE if transaction is cross-border',
    ExceedsHighRiskThreshold BOOLEAN COMMENT 'TRUE if amount exceeds high-risk threshold (e.g. >= 10000)',

    CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
COMMENT 'Transaction history for AML/KYC monitoring and fraud investigation with AI context'
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');

INSERT INTO Transactions VALUES
-- Routine domestic activity
('TXN000001', 'ACC1001', '2026-02-08 14:35:22', 234.50, 'USD', 'Payment', 'US', 'Amazon.com', 'Completed', NULL, FALSE, FALSE, CURRENT_TIMESTAMP()),
('TXN000002', 'ACC1001', '2026-02-08 09:12:45', 1500.00, 'USD', 'Deposit', 'US', 'Payroll Direct Deposit', 'Completed', NULL, FALSE, FALSE, CURRENT_TIMESTAMP()),
('TXN000008', 'ACC1002', '2026-02-08 10:15:55', 500.00, 'USD', 'Payment', 'US', 'Utility Bill', 'Completed', NULL, FALSE, FALSE, CURRENT_TIMESTAMP()),
('TXN000010', 'ACC1008', '2026-02-08 12:05:33', 78.90, 'USD', 'Payment', 'US', 'Coffee Shop', 'Completed', NULL, FALSE, FALSE, CURRENT_TIMESTAMP()),

-- Large but explainable cash withdrawal
('TXN000003', 'ACC1003', '2026-02-08 16:20:10', 15000.00, 'USD', 'Withdrawal', 'US', 'Cash Withdrawal ATM', 'Flagged', 'Large transaction', FALSE, TRUE, CURRENT_TIMESTAMP()),

-- Completed international transfers
('TXN000004', 'ACC1004', '2026-02-08 11:45:30', 8500.00, 'EUR', 'Transfer', 'DE', 'International Wire', 'Completed', NULL, TRUE, FALSE, CURRENT_TIMESTAMP()),
('TXN000005', 'ACC1007', '2026-02-08 08:30:15', 25000.00, 'CNY', 'Transfer', 'CN', 'Business Payment', 'Completed', NULL, TRUE, TRUE, CURRENT_TIMESTAMP()),

-- High-risk cash deposit
('TXN000006', 'ACC1009', '2026-02-08 19:55:40', 45000.00, 'BRL', 'Deposit', 'BR', 'Large Cash Deposit', 'Flagged', 'Large transaction', FALSE, TRUE, CURRENT_TIMESTAMP()),

-- Blocked suspicious transfer
('TXN000007', 'ACC1010', '2026-02-08 14:22:18', 12000.00, 'GBP', 'Transfer', 'GB', 'Suspicious Transfer', 'Blocked', 'Suspicious pattern', TRUE, TRUE, CURRENT_TIMESTAMP()),

-- Investment-related transfer
('TXN000009', 'ACC1005', '2026-02-08 15:40:25', 12500.00, 'EUR', 'Transfer', 'FR', 'Investment Purchase', 'Completed', NULL, TRUE, TRUE, CURRENT_TIMESTAMP()),

-- =============================================
-- DEMO: TWO TOP URGENT ALERT TRANSACTIONS (NEW)
-- =============================================

-- āœ… DEMO #1 (Customer aware / reasonable): large first-time international transfer for CUST00001
('TXN_DEMO_01', 'ACC1001', '2026-02-08 08:30:00', 18000.00, 'USD', 'Transfer', 'DE', 'International Wire - Property Settlement', 'Flagged', 'Cross-border', TRUE, TRUE, CURRENT_TIMESTAMP()),

-- 🚨 DEMO #2 (Customer unreachable): large international transfer for CUST00009 (already Frozen account ACC1010)
('TXN_DEMO_02', 'ACC1010', '2026-02-08 08:40:00', 22000.00, 'GBP', 'Transfer', 'GB', 'International Wire - Beneficiary Added Recently', 'Blocked', 'Suspicious pattern', TRUE, TRUE, CURRENT_TIMESTAMP());

-- =============================================
-- TABLE 4: CREDIT CARDS
-- Purpose: Credit card information for PCI-DSS compliance
-- Compliance: PCI-DSS
-- =============================================
DROP TABLE IF EXISTS CreditCards;

CREATE TABLE CreditCards (
    CardID STRING NOT NULL,
    CustomerID STRING NOT NULL,
    CardNumber STRING COMMENT 'Full card number - PCI-DSS Sensitive',
    CVV STRING COMMENT 'Card Verification Value - PCI-DSS Sensitive',
    ExpirationDate STRING,
    CardType STRING COMMENT 'Visa, Mastercard, Amex, Discover',
    CardStatus STRING COMMENT 'Active, Blocked, Expired',
    CreditLimit DECIMAL(18,2),
    CurrentBalance DECIMAL(18,2),
    LastUsedDate DATE,
    IssueDate DATE,
    CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
COMMENT 'Credit card master data for PCI-DSS compliance demonstrations'
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');

-- Seed cards for masking demos; CARD0008 is blocked to mirror the frozen account
INSERT INTO CreditCards VALUES
    ('CARD0001', 'CUST00001', '4532-1234-5678-9010', '123', '12/2026', 'Visa', 'Active', 10000.00, 2345.60, '2026-02-08', '2020-01-15', CURRENT_TIMESTAMP()),
    ('CARD0002', 'CUST00002', '5425-2345-6789-0123', '456', '06/2025', 'Mastercard', 'Active', 5000.00, 1234.50, '2026-02-08', '2019-05-20', CURRENT_TIMESTAMP()),
    ('CARD0003', 'CUST00003', '3782-456789-01234', '789', '09/2027', 'Amex', 'Active', 15000.00, 5678.90, '2026-02-08', '2021-03-10', CURRENT_TIMESTAMP()),
    ('CARD0004', 'CUST00004', '6011-3456-7890-1234', '234', '03/2026', 'Discover', 'Active', 8000.00, 3456.70, '2026-02-08', '2020-08-25', CURRENT_TIMESTAMP()),
    ('CARD0005', 'CUST00005', '4916-4567-8901-2345', '567', '11/2025', 'Visa', 'Active', 12000.00, 4567.80, '2026-02-08', '2021-11-12', CURRENT_TIMESTAMP()),
    ('CARD0006', 'CUST00006', '5500-5678-9012-3456', '890', '05/2026', 'Mastercard', 'Active', 3000.00, 567.90, '2026-02-08', '2022-02-14', CURRENT_TIMESTAMP()),
    ('CARD0007', 'CUST00007', '4485-6789-0123-4567', '321', '08/2027', 'Visa', 'Active', 20000.00, 12345.00, '2026-02-08', '2019-09-08', CURRENT_TIMESTAMP()),
    ('CARD0008', 'CUST00009', '5425-7890-1234-5678', '654', '02/2024', 'Mastercard', 'Blocked', 7000.00, 6789.50, '2026-02-08', '2020-12-05', CURRENT_TIMESTAMP());

-- =============================================
-- TABLE 5: TRADING POSITIONS
-- Purpose: 
Trading desk positions for Chinese wall enforcement
-- Compliance: SEC, MiFID II, insider trading prevention
-- =============================================
DROP TABLE IF EXISTS TradingPositions;

CREATE TABLE TradingPositions (
    PositionID STRING NOT NULL,
    TraderID STRING NOT NULL COMMENT 'User ID of trader',
    SecurityID STRING NOT NULL COMMENT 'Stock ticker or security identifier',
    SecurityName STRING,
    Quantity INT,
    EntryPrice DECIMAL(18,4),
    CurrentPrice DECIMAL(18,4),
    PnL DECIMAL(18,2) COMMENT 'Profit and Loss',
    TradingDesk STRING COMMENT 'Equity, Fixed_Income, FX, Commodities',
    PositionDate DATE,
    PositionStatus STRING COMMENT 'Open, Closed',
    InformationBarrier STRING COMMENT 'Trading_Side, Advisory_Side, Neutral',
    CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
COMMENT 'Trading positions for Chinese wall and insider trading prevention'
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');

-- Seed open positions on each trading desk, all on the Trading_Side of the wall
INSERT INTO TradingPositions VALUES
    ('POS00001', 'TRADER001', 'AAPL', 'Apple Inc', 1000, 150.25, 175.50, 25250.00, 'Equity', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()),
    ('POS00002', 'TRADER001', 'GOOGL', 'Alphabet Inc', 500, 2800.00, 2950.75, 75375.00, 'Equity', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()),
    ('POS00003', 'TRADER002', 'TSLA', 'Tesla Inc', 2000, 185.50, 165.25, -40500.00, 'Equity', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()),
    ('POS00004', 'TRADER003', 'US10Y', 'US 10-Year Treasury', 10000000, 98.50, 99.25, 75000.00, 'Fixed_Income', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()),
    ('POS00005', 'TRADER004', 'EURUSD', 'Euro/US Dollar', 5000000, 1.0850, 1.0920, 35000.00, 'FX', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP()),
    ('POS00006', 'TRADER005', 'GC', 'Gold Futures', 100, 2050.00, 2075.50, 2550.00, 'Commodities', '2026-02-08', 'Open', 'Trading_Side', CURRENT_TIMESTAMP());

-- =============================================
-- TABLE 6: 
AML ALERTS
-- Purpose: Anti-Money Laundering alert management
-- Compliance: AML/KYC, FATF, FinCEN
-- =============================================
DROP TABLE IF EXISTS AMLAlerts;

CREATE TABLE AMLAlerts (
    AlertID STRING NOT NULL,
    CustomerID STRING NOT NULL,
    TransactionID STRING,
    AlertDate TIMESTAMP,
    AlertType STRING COMMENT 'Large Transaction, Structuring, Cross-Border, Rapid Movement',
    RiskScore INT COMMENT '1-100 risk assessment',
    InvestigationStatus STRING COMMENT 'New, Under Review, Escalated, Cleared, SAR Filed',
    AssignedInvestigator STRING,
    InvestigationNotes STRING COMMENT 'Sensitive investigation details',
    ResolutionDate TIMESTAMP,
    SARFiled BOOLEAN COMMENT 'Suspicious Activity Report filed with FinCEN',
    CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
COMMENT 'AML alerts and investigation tracking for compliance monitoring'
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');

INSERT INTO AMLAlerts VALUES
-- āœ… DEMO #1 (Customer aware) - still urgent but slightly lower than DEMO #2
(
    'AML_DEMO_01',
    'CUST00001',
    'TXN_DEMO_01',
    '2026-02-08 09:00:00',
    'Cross-Border',
    88,
    'Under Review',
    'AML_INV_DEMO',
    'First-time large international transfer flagged by threshold and cross-border controls',
    NULL,
    FALSE,
    CURRENT_TIMESTAMP()
),

-- 🚨 DEMO #2 (Customer unreachable) - highest urgency
(
    'AML_DEMO_02',
    'CUST00009',
    'TXN_DEMO_02',
    '2026-02-08 09:05:00',
    'Cross-Border',
    92,
    'Under Review',
    'AML_INV_DEMO',
    'Large international transfer blocked; account is frozen and customer could not be reached',
    NULL,
    FALSE,
    CURRENT_TIMESTAMP()
);
-- =============================================
-- TABLE 7: AUDIT LOGS
-- Purpose: Audit trail for SOX compliance
-- Compliance: SOX, regulatory audit requirements
-- =============================================
DROP TABLE IF EXISTS AuditLogs;

CREATE TABLE AuditLogs (
    LogID STRING NOT NULL,
    UserID STRING NOT NULL,
    UserRole STRING,
    AccessTime TIMESTAMP,
    TableAccessed STRING,
    OperationType STRING COMMENT 'SELECT, INSERT, UPDATE, DELETE',
    RecordsAffected INT,
    AuditProject STRING COMMENT 'Q1_SOX_Audit, Annual_Financial_Audit, Regulatory_Review',
    AccessGrantedUntil DATE COMMENT 'Temporary access expiration date',
    IPAddress STRING,
    SessionID STRING,
    CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
COMMENT 'Audit log for access tracking and SOX compliance'
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');

INSERT INTO AuditLogs VALUES
    ('LOG00001', 'auditor@external.com', 'External_Auditor', '2026-02-08 10:30:00', 'Accounts', 'SELECT', 150, 'Q1_SOX_Audit', '2026-02-08', '203.0.113.25', 'SESS_A1B2C3', CURRENT_TIMESTAMP()),
    ('LOG00002', 'compliance@company.com', 'Compliance_Officer', '2026-02-08 14:20:00', 'AMLAlerts', 'SELECT', 45, 'Regulatory_Review', '2026-02-08', '198.51.100.42', 'SESS_D4E5F6', CURRENT_TIMESTAMP()),
    ('LOG00003', 'analyst@company.com', 'AML_Investigator_Senior', '2026-02-08 09:15:00', 'Transactions', 'SELECT', 8932, NULL, '2026-02-08', '192.0.2.15', 'SESS_G7H8I9', CURRENT_TIMESTAMP()),
    ('LOG00004', 'support@company.com', 'Credit_Card_Support', '2026-02-08 11:45:00', 'CreditCards', 'SELECT', 23, NULL, '2026-02-08', '198.51.100.87', 'SESS_J1K2L3', CURRENT_TIMESTAMP());

-- TABLE 8: CUSTOMER INTERACTIONS
-- Purpose: Agent call/chat/email notes used as context during fraud investigation
DROP TABLE IF EXISTS CustomerInteractions;

CREATE TABLE CustomerInteractions (
    InteractionID STRING NOT NULL,
    CustomerID STRING NOT NULL,
    InteractionTime TIMESTAMP,
    Channel STRING COMMENT 'Call, Chat, Email',
    AgentID STRING,
    InteractionNotes STRING COMMENT 'Free-text customer interaction notes',
    CreatedDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
COMMENT 'Customer interaction history used for fraud investigation context'
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');

INSERT INTO CustomerInteractions VALUES
-- āœ… Customer aware -> approve/monitor
(
    'INT_DEMO_01',
    'CUST00001',
    '2026-02-08 08:45:00',
    'Call',
    'AGENT_101',
    'Customer confirmed the international transfer was intentional and related to an overseas property purchase. Customer acknowledged the amount and destination account.',
    CURRENT_TIMESTAMP()
),

-- 🚨 Customer unreachable -> escalate
(
    'INT_DEMO_02',
    'CUST00009',
    '2026-02-08 08:50:00',
    'Call',
    'AGENT_102',
    'Multiple attempts were made to contact the customer regarding the international transfer. No response was received and the customer could not be reached.',
    CURRENT_TIMESTAMP()
);

-- =============================================
-- VERIFICATION
-- =============================================

-- Show all created tables
SHOW TABLES IN finance;

-- Display row counts
-- Fix: the script creates 8 tables, but the original verification and status
-- messages omitted CustomerInteractions and claimed only 7 tables.
SELECT 'Customers' as table_name, COUNT(*) as row_count FROM Customers
UNION ALL
SELECT 'Accounts', COUNT(*) FROM Accounts
UNION ALL
SELECT 'Transactions', COUNT(*) FROM Transactions
UNION ALL
SELECT 'CreditCards', COUNT(*) FROM CreditCards
UNION ALL
SELECT 'TradingPositions', COUNT(*) FROM TradingPositions
UNION ALL
SELECT 'AMLAlerts', COUNT(*) FROM AMLAlerts
UNION ALL
SELECT 'AuditLogs', COUNT(*) FROM AuditLogs
UNION ALL
SELECT 'CustomerInteractions', COUNT(*) FROM CustomerInteractions
ORDER BY table_name;

SELECT 'āœ… Successfully created 8 finance tables with sample data' as status;
SELECT 'šŸ“Š Tables: Customers, Accounts, Transactions, CreditCards, TradingPositions, AMLAlerts, AuditLogs, CustomerInteractions' as tables_created;
SELECT 'šŸ” Ready for: PCI-DSS, AML/KYC, GDPR, SOX, GLBA compliance demonstrations' as compliance_ready;


-- Show the two top urgent alerts
SELECT
    a.AlertID,
    a.AlertDate,
    a.RiskScore,
    a.InvestigationStatus,
    a.CustomerID,
    a.TransactionID
FROM AMLAlerts a
ORDER BY a.RiskScore DESC, a.AlertDate DESC;

-- Verify both demo transactions exist and are international + exceed threshold
SELECT
    TransactionID,
    AccountID,
    TransactionDate,
    Amount,
    Currency,
    CountryCode,
    TransactionStatus,
    AMLFlagReason,
    IsInternational,
    ExceedsHighRiskThreshold
FROM Transactions
WHERE TransactionID IN ('TXN_DEMO_01', 'TXN_DEMO_02')
ORDER BY TransactionDate;

-- Verify interactions exist for both customers
SELECT
    CustomerID,
    InteractionTime,
    Channel,
    AgentID,
    InteractionNotes
FROM CustomerInteractions
ORDER BY InteractionTime DESC;
diff --git a/uc-quickstart/utils/genie/aws/examples/finance/finance.tfvars.example b/uc-quickstart/utils/genie/aws/examples/finance/finance.tfvars.example new file mode 100644 index 00000000..59019cb6 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/finance/finance.tfvars.example @@ -0,0 +1,149 @@
# ============================================================================
# Finance ABAC Example — Complete tfvars (ABAC config only)
# ============================================================================
# This reproduces the original 5-group finance demo.
# Credentials go in auth.auto.tfvars; environment config in env.auto.tfvars.
#
# Setup:
#   1. cp auth.auto.tfvars.example auth.auto.tfvars   (credentials — gitignored)
#      cp env.auto.tfvars.example env.auto.tfvars     (tables + environment)
#   2. cp examples/finance/finance.tfvars.example abac.auto.tfvars
#   3. Run examples/finance/0.1finance_abac_functions.sql in SQL editor
#   4. Run examples/finance/0.2finance_database_schema.sql in SQL editor
#   5. 
terraform apply
# ============================================================================

# === Groups ===
# One group per access tier; descriptions summarize what each tier may see.
groups = {
  "Junior_Analyst"     = { description = "Masked PII, last-4 card, rounded amounts" }
  "Senior_Analyst"     = { description = "Full PII, full card, full amounts" }
  "US_Region_Staff"    = { description = "Row access limited to US data" }
  "EU_Region_Staff"    = { description = "Row access limited to EU data" }
  "Compliance_Officer" = { description = "Full unmasked access" }
}

# === Tag policies ===
# One governed tag per sensitivity dimension.
tag_policies = [
  { key = "pii_level", description = "PII access level", values = ["Limited_PII", "Full_PII"] },
  { key = "pci_clearance", description = "PCI-DSS clearance", values = ["Basic", "Full", "Administrative"] },
  { key = "aml_clearance", description = "AML investigation clearance", values = ["Junior_Analyst", "Senior_Investigator", "Compliance_Officer"] },
  { key = "customer_region", description = "Customer data region", values = ["Regional", "US", "EU"] },
  { key = "data_residency", description = "Data residency", values = ["Global", "US", "EU"] },
]

# === Tag assignments ===
# entity_name is relative to uc_catalog_name.uc_schema_name.
# For tables:  "TableName"
# For columns: "TableName.ColumnName"
tag_assignments = [
  # Customers table
  { entity_type = "tables", entity_name = "Customers", tag_key = "data_residency", tag_value = "Global" },
  { entity_type = "tables", entity_name = "Customers", tag_key = "pii_level", tag_value = "Full_PII" },
  { entity_type = "tables", entity_name = "Customers", tag_key = "customer_region", tag_value = "Regional" },
  { entity_type = "columns", entity_name = "Customers.CustomerRegion", tag_key = "customer_region", tag_value = "EU" },
  { entity_type = "columns", entity_name = "Customers.CustomerRegion", tag_key = "data_residency", tag_value = "EU" },
  { entity_type = "columns", entity_name = "Customers.SSN", tag_key = "pii_level", tag_value = "Full_PII" },
  { entity_type = "columns", entity_name = "Customers.SSN", tag_key = "data_residency", tag_value = "US" },
  { entity_type = "columns", entity_name = "Customers.FirstName", tag_key = "pii_level", tag_value = "Limited_PII" },
  { entity_type = "columns", entity_name = "Customers.LastName", tag_key = "pii_level", tag_value = "Limited_PII" },
  { entity_type = "columns", entity_name = "Customers.Email", tag_key = "pii_level", tag_value = "Limited_PII" },

  # CreditCards table
  { entity_type = "tables", entity_name = "CreditCards", tag_key = "pci_clearance", tag_value = "Full" },
  { entity_type = "columns", entity_name = "CreditCards.CardNumber", tag_key = "pci_clearance", tag_value = "Full" },
  { entity_type = "columns", entity_name = "CreditCards.CVV", tag_key = "pci_clearance", tag_value = "Administrative" },

  # Transactions table
  { entity_type = "tables", entity_name = "Transactions", tag_key = "aml_clearance", tag_value = "Senior_Investigator" },
  { entity_type = "columns", entity_name = "Transactions.Amount", tag_key = "aml_clearance", tag_value = "Junior_Analyst" },

  # Accounts table
  { entity_type = "tables", entity_name = "Accounts", tag_key = "data_residency", tag_value = "Global" },
  { entity_type = "tables", entity_name = "Accounts", tag_key = "customer_region", tag_value = "Regional" },
]

# === FGAC policies ===
# function_name is relative to uc_catalog_name.uc_schema_name (just the function name).
fgac_policies = [
  # PII masking — junior analysts
  {
    name            = "pii_junior_mask"
    policy_type     = "POLICY_TYPE_COLUMN_MASK"
    to_principals   = ["Junior_Analyst"]
    comment         = "PII: Mask names and email for junior analysts"
    match_condition = "hasTagValue('pii_level', 'Limited_PII')"
    match_alias     = "pii_cols"
    function_name   = "mask_pii_partial"
  },
  {
    name            = "pii_junior_ssn"
    policy_type     = "POLICY_TYPE_COLUMN_MASK"
    to_principals   = ["Junior_Analyst"]
    comment         = "PII: Mask SSN for junior analysts"
    match_condition = "hasTagValue('pii_level', 'Full_PII') AND hasTagValue('data_residency', 'US')"
    match_alias     = "ssn_cols"
    function_name   = "mask_ssn"
  },

  # PCI — credit card masking
  {
    name            = "pci_junior_last4"
    policy_type     = "POLICY_TYPE_COLUMN_MASK"
    to_principals   = ["Junior_Analyst"]
    comment         = "Card: Last 4 digits only for junior analysts"
    match_condition = "hasTagValue('pci_clearance', 'Full')"
    match_alias     = "card_cols"
    function_name   = "mask_credit_card_last4"
  },
  {
    name              = "pci_cvv_mask_except_compliance"
    policy_type       = "POLICY_TYPE_COLUMN_MASK"
    to_principals     = ["account users"]
    except_principals = ["Compliance_Officer"]
    comment           = "Card: Mask CVV for all except Compliance_Officer"
    match_condition   = "hasTagValue('pci_clearance', 'Administrative')"
    match_alias       = "cvv_cols"
    function_name     = "mask_credit_card_full"
  },

  # AML — transaction amount rounding
  {
    name            = "aml_junior_round"
    policy_type     = "POLICY_TYPE_COLUMN_MASK"
    to_principals   = ["Junior_Analyst"]
    comment         = "Transactions: Round amount for junior analysts"
    match_condition = "hasTagValue('aml_clearance', 'Junior_Analyst')"
    match_alias     = "aml_cols"
    function_name   = "mask_amount_rounded"
  },

  # Regional row filters
  {
    name           = "region_us"
    policy_type    = "POLICY_TYPE_ROW_FILTER"
    to_principals  = ["US_Region_Staff"]
    comment        = "Region: US staff see US customer data only"
    when_condition = "hasTagValue('customer_region', 'Regional')"
    function_name  = "filter_by_region_us"
  },
  {
    name           = "region_eu"
    policy_type    = "POLICY_TYPE_ROW_FILTER"
    to_principals  = ["EU_Region_Staff"]
    comment        = "Region: EU staff see EU customer data only"
    when_condition = "hasTagValue('customer_region', 'Regional')"
    function_name  = "filter_by_region_eu"
  },
]

# === Group members (optional) ===
# Map of group name -> list of account-level user IDs.
group_members = {
  "Junior_Analyst"  = ["4170683363832239"]
  "US_Region_Staff" = ["4170683363832239"]
  "Senior_Analyst"  = ["6016306480479573", "1493916322305156"]
  "EU_Region_Staff" = ["6016306480479573", "1493916322305156"]
}

# === Genie Space (optional) ===
# genie_space_id = ""
diff --git a/uc-quickstart/utils/genie/aws/examples/healthcare/ABAC_PROMPT_HEALTHCARE.md b/uc-quickstart/utils/genie/aws/examples/healthcare/ABAC_PROMPT_HEALTHCARE.md new file mode 100644 index 00000000..c7a86c9f --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/healthcare/ABAC_PROMPT_HEALTHCARE.md @@ -0,0 +1,202 @@
You are an expert in Databricks Unity Catalog Attribute-Based Access Control (ABAC). I will give you my table schemas. You will analyze the columns for sensitivity (PII, financial, health, etc.), then generate two files:

### What is ABAC?

ABAC uses governed **tags** on tables/columns and **FGAC policies** (column masks + row filters) to control data access based on **group membership**. The flow is:

1. Create **groups** (access tiers like "Junior_Analyst", "Admin")
2. Create **tag policies** (e.g., `sensitivity` with values `public`, `confidential`, `restricted`)
3. Assign **tags** to tables and columns
4. 
Create **FGAC policies** that match tagged columns/tables and apply masking functions for specific groups + +### Available Masking Function Patterns + +Use these signatures. Replace `{catalog}.{schema}` with the user's catalog and schema. + +**PII:** +- `mask_pii_partial(input STRING) RETURNS STRING` — first + last char visible, middle masked +- `mask_ssn(ssn STRING) RETURNS STRING` — last 4 digits of SSN visible +- `mask_email(email STRING) RETURNS STRING` — masks local part, keeps domain +- `mask_phone(phone STRING) RETURNS STRING` — last 4 digits visible +- `mask_full_name(name STRING) RETURNS STRING` — reduces to initials + +**Financial:** +- `mask_credit_card_full(card_number STRING) RETURNS STRING` — all digits hidden +- `mask_credit_card_last4(card_number STRING) RETURNS STRING` — last 4 visible +- `mask_account_number(account_id STRING) RETURNS STRING` — deterministic SHA-256 token +- `mask_amount_rounded(amount DECIMAL(18,2)) RETURNS DECIMAL(18,2)` — round to nearest 10/100 +- `mask_iban(iban STRING) RETURNS STRING` — country code + last 4 + +**Health:** +- `mask_mrn(mrn STRING) RETURNS STRING` — last 4 digits of MRN +- `mask_diagnosis_code(code STRING) RETURNS STRING` — ICD category visible, specifics hidden + +**General:** +- `mask_redact(input STRING) RETURNS STRING` — replace with `[REDACTED]` +- `mask_hash(input STRING) RETURNS STRING` — full SHA-256 hash +- `mask_nullify(input STRING) RETURNS STRING` — return NULL + +**Row Filters (zero-argument):** +- `filter_by_region_us() RETURNS BOOLEAN` — US regional filter +- `filter_by_region_eu() RETURNS BOOLEAN` — EU regional filter +- `filter_by_region_apac() RETURNS BOOLEAN` — APAC regional filter +- `filter_trading_hours() RETURNS BOOLEAN` — outside NYSE hours only +- `filter_audit_expiry() RETURNS BOOLEAN` — temporary auditor access + +If none of these fit, create a new function following the same pattern (NULL-safe CASE expression, COMMENT describing usage). 
+ +### Output Format — File 1: `masking_functions.sql` + +```sql +USE CATALOG {catalog}; +USE SCHEMA {schema}; + +CREATE OR REPLACE FUNCTION function_name(param TYPE) +RETURNS TYPE +COMMENT 'description' +RETURN CASE ... END; +``` + +Only include functions the user actually needs. If a library function works as-is, still include it so the user has a self-contained SQL file. + +### Output Format — File 2: `abac.auto.tfvars` + +```hcl +# Authentication (user fills in) +databricks_account_id = "" +databricks_client_id = "" +databricks_client_secret = "" +databricks_workspace_id = "" +databricks_workspace_host = "" + +uc_catalog_name = "{catalog}" +uc_schema_name = "{schema}" + +groups = { + "GroupName" = { description = "What this group can see" } +} + +tag_policies = [ + { key = "tag_name", description = "...", values = ["val1", "val2"] }, +] + +# entity_name and function_name are RELATIVE to uc_catalog_name.uc_schema_name. +# Terraform automatically prepends the catalog.schema prefix. +tag_assignments = [ + { entity_type = "columns", entity_name = "Table.Column", tag_key = "tag_name", tag_value = "val1" }, +] + +fgac_policies = [ + # Column mask: + { + name = "policy_name" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["GroupName"] + comment = "Description" + match_condition = "hasTagValue('tag_name', 'val1')" + match_alias = "alias" + function_name = "function_name" + }, + # Row filter: + { + name = "filter_name" + policy_type = "POLICY_TYPE_ROW_FILTER" + to_principals = ["GroupName"] + comment = "Description" + when_condition = "hasTagValue('tag_name', 'val1')" + function_name = "filter_function" + }, +] + +group_members = {} +``` + +### Validation + +After generating both files, the user should validate them before running `terraform apply`: + +```bash +pip install python-hcl2 +python validate_abac.py abac.auto.tfvars masking_functions.sql +``` + +This checks cross-references (groups, tags, functions), naming conventions, and structure. 
Fix any `[FAIL]` errors before proceeding. + +### Instructions + +1. Use the user's **catalog** and **schema** from the "MY CATALOG AND SCHEMA" section for `USE CATALOG` / `USE SCHEMA` in SQL and `uc_catalog_name` / `uc_schema_name` in tfvars +2. Analyze each column in the user's tables for sensitivity: + - PII (names, emails, SSN, phone, address) + - Financial (credit cards, account numbers, amounts, IBAN) + - Health (MRN, diagnosis codes) + - Regional/residency (region columns that need row filtering) +3. Propose groups — typically 2-5 access tiers (e.g., restricted, standard, privileged, admin) +4. Design tag policies — one per sensitivity dimension (e.g., `pii_level`, `pci_clearance`) +5. Map tags to the user's specific tables and columns +6. Select masking functions from the library above (or create new ones) +7. Generate both output files using **relative** names (Terraform prepends `uc_catalog_name.uc_schema_name` automatically) + +--- + +### MY CATALOG AND SCHEMA + +``` +Catalog: MY_CATALOG (e.g. prod_healthcare, my_dev_catalog) +Schema: clinical (e.g. 
clinical, finance, public) +``` + +### MY TABLES (paste below) + +``` +CREATE TABLE clinical.billing ( + BillingID BIGINT COMMENT 'Unique billing identifier', + PatientID BIGINT COMMENT 'FK to Patients', + EncounterID BIGINT COMMENT 'FK to Encounters', + TotalAmount DECIMAL(18,2) COMMENT 'Total billed amount', + InsurancePaid DECIMAL(18,2) COMMENT 'Amount covered by insurance', + PatientOwed DECIMAL(18,2) COMMENT 'Patient responsibility', + BillingCode STRING COMMENT 'CPT/HCPCS billing code', + InsuranceID STRING COMMENT 'Insurance policy used') +USING delta +TBLPROPERTIES ( + 'delta.enableDeletionVectors' = 'true', + 'delta.enableRowTracking' = 'true', + 'delta.feature.appendOnly' = 'supported', + 'delta.feature.deletionVectors' = 'supported', + 'delta.feature.domainMetadata' = 'supported', + 'delta.feature.invariants' = 'supported', + 'delta.feature.rowTracking' = 'supported', + 'delta.minReaderVersion' = '3', + 'delta.minWriterVersion' = '7', + 'delta.parquet.compression.codec' = 'zstd') + +CREATE TABLE encounters ( EncounterID BIGINT COMMENT 'Unique encounter identifier', PatientID BIGINT COMMENT 'FK to Patients', EncounterDate TIMESTAMP COMMENT 'Date/time of encounter', EncounterType STRING COMMENT 'INPATIENT, OUTPATIENT, EMERGENCY', DiagnosisCode STRING COMMENT 'ICD-10 diagnosis code', DiagnosisDesc STRING COMMENT 'Full diagnosis description', TreatmentNotes STRING COMMENT 'Free-text clinical notes', AttendingDoc STRING COMMENT 'Attending physician name', FacilityRegion STRING COMMENT 'Hospital region: US_EAST, US_WEST, EU') USING delta TBLPROPERTIES ( 'delta.enableDeletionVectors' = 'true', 'delta.enableRowTracking' = 'true', 'delta.feature.appendOnly' = 'supported', 'delta.feature.deletionVectors' = 'supported', 'delta.feature.domainMetadata' = 'supported', 'delta.feature.invariants' = 'supported', 'delta.feature.rowTracking' = 'supported', 'delta.minReaderVersion' = '3', 'delta.minWriterVersion' = '7', 'delta.parquet.compression.codec' = 'zstd') + +CREATE 
TABLE patients ( + PatientID BIGINT COMMENT 'Unique patient identifier', + MRN STRING COMMENT 'Medical Record Number', + FirstName STRING COMMENT 'Patient first name', + LastName STRING COMMENT 'Patient last name', + DateOfBirth DATE COMMENT 'Date of birth', + SSN STRING COMMENT 'Social Security Number', + Email STRING COMMENT 'Contact email', + Phone STRING COMMENT 'Contact phone number', + Address STRING COMMENT 'Home address', + InsuranceID STRING COMMENT 'Insurance policy number', + PrimaryCareDoc STRING COMMENT 'Assigned physician name', + FacilityRegion STRING COMMENT 'Hospital region: US_EAST, US_WEST, EU') +USING delta +TBLPROPERTIES ( + 'delta.enableDeletionVectors' = 'true', + 'delta.enableRowTracking' = 'true', + 'delta.feature.appendOnly' = 'supported', + 'delta.feature.deletionVectors' = 'supported', + 'delta.feature.domainMetadata' = 'supported', + 'delta.feature.invariants' = 'supported', + 'delta.feature.rowTracking' = 'supported', + 'delta.minReaderVersion' = '3', + 'delta.minWriterVersion' = '7', + 'delta.parquet.compression.codec' = 'zstd') + +CREATE TABLE prescriptions ( PrescriptionID BIGINT COMMENT 'Unique prescription identifier', PatientID BIGINT COMMENT 'FK to Patients', EncounterID BIGINT COMMENT 'FK to Encounters', DrugName STRING COMMENT 'Medication name', Dosage STRING COMMENT 'Dosage instructions', Quantity INT COMMENT 'Number of units prescribed', PrescribingDoc STRING COMMENT 'Prescribing physician', PrescribedDate DATE COMMENT 'Date prescribed') USING delta TBLPROPERTIES ( 'delta.enableDeletionVectors' = 'true', 'delta.enableRowTracking' = 'true', 'delta.feature.appendOnly' = 'supported', 'delta.feature.deletionVectors' = 'supported', 'delta.feature.domainMetadata' = 'supported', 'delta.feature.invariants' = 'supported', 'delta.feature.rowTracking' = 'supported', 'delta.minReaderVersion' = '3', 'delta.minWriterVersion' = '7', 'delta.parquet.compression.codec' = 'zstd') +``` diff --git 
a/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/billing.sql b/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/billing.sql new file mode 100644 index 00000000..a4ef1851 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/billing.sql @@ -0,0 +1,10 @@ +CREATE TABLE Billing ( + BillingID BIGINT COMMENT 'Unique billing identifier', + PatientID BIGINT COMMENT 'FK to Patients', + EncounterID BIGINT COMMENT 'FK to Encounters', + TotalAmount DECIMAL(18,2) COMMENT 'Total billed amount', + InsurancePaid DECIMAL(18,2) COMMENT 'Amount covered by insurance', + PatientOwed DECIMAL(18,2) COMMENT 'Patient responsibility', + BillingCode STRING COMMENT 'CPT/HCPCS billing code', + InsuranceID STRING COMMENT 'Insurance policy used' +); diff --git a/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/encounters.sql b/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/encounters.sql new file mode 100644 index 00000000..57e914dd --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/encounters.sql @@ -0,0 +1,11 @@ +CREATE TABLE Encounters ( + EncounterID BIGINT COMMENT 'Unique encounter identifier', + PatientID BIGINT COMMENT 'FK to Patients', + EncounterDate TIMESTAMP COMMENT 'Date/time of encounter', + EncounterType STRING COMMENT 'INPATIENT, OUTPATIENT, EMERGENCY', + DiagnosisCode STRING COMMENT 'ICD-10 diagnosis code', + DiagnosisDesc STRING COMMENT 'Full diagnosis description', + TreatmentNotes STRING COMMENT 'Free-text clinical notes', + AttendingDoc STRING COMMENT 'Attending physician name', + FacilityRegion STRING COMMENT 'Hospital region: US_EAST, US_WEST, EU' +); diff --git a/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/patients.sql b/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/patients.sql new file mode 100644 index 00000000..bd2e31c2 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/patients.sql @@ -0,0 +1,14 @@ +CREATE TABLE Patients ( + PatientID BIGINT COMMENT 'Unique 
patient identifier', + MRN STRING COMMENT 'Medical Record Number', + FirstName STRING COMMENT 'Patient first name', + LastName STRING COMMENT 'Patient last name', + DateOfBirth DATE COMMENT 'Date of birth', + SSN STRING COMMENT 'Social Security Number', + Email STRING COMMENT 'Contact email', + Phone STRING COMMENT 'Contact phone number', + Address STRING COMMENT 'Home address', + InsuranceID STRING COMMENT 'Insurance policy number', + PrimaryCareDoc STRING COMMENT 'Assigned physician name', + FacilityRegion STRING COMMENT 'Hospital region: US_EAST, US_WEST, EU' +); diff --git a/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/prescriptions.sql b/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/prescriptions.sql new file mode 100644 index 00000000..a5793b82 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/healthcare/ddl/prescriptions.sql @@ -0,0 +1,10 @@ +CREATE TABLE Prescriptions ( + PrescriptionID BIGINT COMMENT 'Unique prescription identifier', + PatientID BIGINT COMMENT 'FK to Patients', + EncounterID BIGINT COMMENT 'FK to Encounters', + DrugName STRING COMMENT 'Medication name', + Dosage STRING COMMENT 'Dosage instructions', + Quantity INT COMMENT 'Number of units prescribed', + PrescribingDoc STRING COMMENT 'Prescribing physician', + PrescribedDate DATE COMMENT 'Date prescribed' +); diff --git a/uc-quickstart/utils/genie/aws/examples/healthcare/healthcare.tfvars.example b/uc-quickstart/utils/genie/aws/examples/healthcare/healthcare.tfvars.example new file mode 100644 index 00000000..9e66bc4a --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/healthcare/healthcare.tfvars.example @@ -0,0 +1,183 @@ +# Healthcare ABAC — Example abac.auto.tfvars (ABAC config only) +# Generated by the AI-Assisted workflow (Tier 3) from ABAC_PROMPT.md +# +# Credentials go in auth.auto.tfvars; environment config in env.auto.tfvars. +# +# Usage: +# 1. 
cp auth.auto.tfvars.example auth.auto.tfvars (credentials — gitignored) +# cp env.auto.tfvars.example env.auto.tfvars (tables + environment) +# 2. cp examples/healthcare/healthcare.tfvars.example abac.auto.tfvars +# 3. Run examples/healthcare/masking_functions.sql in a Databricks SQL editor +# 4. terraform init && terraform plan && terraform apply + +# === Groups === +groups = { + "Nurse" = { description = "Bedside care — partial PII, limited clinical notes" } + "Physician" = { description = "Full clinical access, full PII for their region" } + "Billing_Clerk" = { description = "Financial records — masked PHI, no clinical notes" } + "Chief_Medical_Officer" = { description = "Full unrestricted access across all regions" } + "US_East_Staff" = { description = "Row access limited to US_EAST facility data" } + "US_West_Staff" = { description = "Row access limited to US_WEST facility data" } +} + +# === Tag Policies === +tag_policies = [ + { key = "phi_level", description = "Protected Health Information access tier", values = ["Restricted_PHI", "Limited_PHI", "Full_PHI"] }, + { key = "pii_level", description = "Personally identifiable information tier", values = ["Limited_PII", "Full_PII"] }, + { key = "financial_access", description = "Billing/financial data clearance", values = ["Summary", "Full"] }, + { key = "facility_region", description = "Hospital facility region for row filtering", values = ["Regional"] }, +] + +# === Tag Assignments === +# entity_name is relative to uc_catalog_name.uc_schema_name. 
+tag_assignments = [ + # --- Patients table --- + { entity_type = "tables", entity_name = "Patients", tag_key = "pii_level", tag_value = "Full_PII" }, + { entity_type = "tables", entity_name = "Patients", tag_key = "facility_region", tag_value = "Regional" }, + { entity_type = "columns", entity_name = "Patients.MRN", tag_key = "phi_level", tag_value = "Restricted_PHI" }, + { entity_type = "columns", entity_name = "Patients.FirstName", tag_key = "pii_level", tag_value = "Limited_PII" }, + { entity_type = "columns", entity_name = "Patients.LastName", tag_key = "pii_level", tag_value = "Limited_PII" }, + { entity_type = "columns", entity_name = "Patients.SSN", tag_key = "pii_level", tag_value = "Full_PII" }, + { entity_type = "columns", entity_name = "Patients.Email", tag_key = "pii_level", tag_value = "Limited_PII" }, + { entity_type = "columns", entity_name = "Patients.Phone", tag_key = "pii_level", tag_value = "Limited_PII" }, + { entity_type = "columns", entity_name = "Patients.Address", tag_key = "pii_level", tag_value = "Full_PII" }, + { entity_type = "columns", entity_name = "Patients.InsuranceID", tag_key = "phi_level", tag_value = "Limited_PHI" }, + + # --- Encounters table --- + { entity_type = "tables", entity_name = "Encounters", tag_key = "phi_level", tag_value = "Full_PHI" }, + { entity_type = "tables", entity_name = "Encounters", tag_key = "facility_region", tag_value = "Regional" }, + { entity_type = "columns", entity_name = "Encounters.DiagnosisCode", tag_key = "phi_level", tag_value = "Restricted_PHI" }, + { entity_type = "columns", entity_name = "Encounters.DiagnosisDesc", tag_key = "phi_level", tag_value = "Full_PHI" }, + { entity_type = "columns", entity_name = "Encounters.TreatmentNotes", tag_key = "phi_level", tag_value = "Full_PHI" }, + + # --- Prescriptions table --- + { entity_type = "tables", entity_name = "Prescriptions", tag_key = "phi_level", tag_value = "Limited_PHI" }, + { entity_type = "columns", entity_name = "Prescriptions.DrugName", 
tag_key = "phi_level", tag_value = "Limited_PHI" }, + { entity_type = "columns", entity_name = "Prescriptions.Dosage", tag_key = "phi_level", tag_value = "Limited_PHI" }, + + # --- Billing table --- + { entity_type = "tables", entity_name = "Billing", tag_key = "financial_access", tag_value = "Full" }, + { entity_type = "columns", entity_name = "Billing.TotalAmount", tag_key = "financial_access", tag_value = "Full" }, + { entity_type = "columns", entity_name = "Billing.InsurancePaid", tag_key = "financial_access", tag_value = "Full" }, + { entity_type = "columns", entity_name = "Billing.PatientOwed", tag_key = "financial_access", tag_value = "Summary" }, + { entity_type = "columns", entity_name = "Billing.InsuranceID", tag_key = "phi_level", tag_value = "Limited_PHI" }, +] + +# === FGAC Policies === +# function_name is relative — Terraform prepends catalog.schema automatically. +fgac_policies = [ + # -- PII masking for Nurses -- + { + name = "pii_nurse_partial" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Nurse"] + comment = "Nurses see partial names and contact info" + match_condition = "hasTagValue('pii_level', 'Limited_PII')" + match_alias = "pii_limited" + function_name = "mask_pii_partial" + }, + { + name = "pii_nurse_ssn" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Nurse"] + comment = "Nurses see last-4 SSN only" + match_condition = "hasTagValue('pii_level', 'Full_PII')" + match_alias = "pii_full" + function_name = "mask_ssn" + }, + + # -- PII masking for Billing Clerks -- + { + name = "pii_billing_partial" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Billing_Clerk"] + comment = "Billing clerks see partial patient names" + match_condition = "hasTagValue('pii_level', 'Limited_PII')" + match_alias = "pii_limited" + function_name = "mask_pii_partial" + }, + { + name = "pii_billing_ssn" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Billing_Clerk"] + comment = "Billing clerks cannot see SSN or 
address" + match_condition = "hasTagValue('pii_level', 'Full_PII')" + match_alias = "pii_full" + function_name = "mask_redact" + }, + + # -- PHI masking for Nurses -- + { + name = "phi_nurse_mrn" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Nurse"] + comment = "Nurses see last-4 of MRN" + match_condition = "hasTagValue('phi_level', 'Restricted_PHI')" + match_alias = "phi_restricted" + function_name = "mask_mrn" + }, + + # -- PHI masking for Billing Clerks (no clinical details) -- + { + name = "phi_billing_redact" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Billing_Clerk"] + comment = "Billing clerks cannot see diagnosis or treatment notes" + match_condition = "hasTagValue('phi_level', 'Full_PHI')" + match_alias = "phi_full" + function_name = "mask_redact" + }, + { + name = "phi_billing_diagnosis" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Billing_Clerk"] + comment = "Billing clerks see ICD category only" + match_condition = "hasTagValue('phi_level', 'Restricted_PHI')" + match_alias = "phi_restricted" + function_name = "mask_diagnosis_code" + }, + + # -- Financial masking for Nurses -- + { + name = "fin_nurse_rounded" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Nurse"] + comment = "Nurses see rounded billing amounts" + match_condition = "hasTagValue('financial_access', 'Full')" + match_alias = "fin_full" + function_name = "mask_amount_rounded" + }, + + # -- Insurance ID masking (tokenize for non-billing staff) -- + { + name = "phi_insurance_mask" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["account users"] + except_principals = ["Billing_Clerk", "Chief_Medical_Officer"] + comment = "Insurance ID tokenized for non-billing staff" + match_condition = "hasTagValue('phi_level', 'Limited_PHI')" + match_alias = "phi_limited" + function_name = "mask_account_number" + }, + + # -- Regional row filters -- + { + name = "region_us_east" + policy_type = "POLICY_TYPE_ROW_FILTER" + 
to_principals = ["US_East_Staff"] + comment = "US East staff see only US_EAST facility data" + when_condition = "hasTagValue('facility_region', 'Regional')" + function_name = "filter_facility_us_east" + }, + { + name = "region_us_west" + policy_type = "POLICY_TYPE_ROW_FILTER" + to_principals = ["US_West_Staff"] + comment = "US West staff see only US_WEST facility data" + when_condition = "hasTagValue('facility_region', 'Regional')" + function_name = "filter_facility_us_west" + }, +] + +# === Group Members (optional — fill in account-level user IDs) === +group_members = {} diff --git a/uc-quickstart/utils/genie/aws/examples/healthcare/healthcare_walkthrough.md b/uc-quickstart/utils/genie/aws/examples/healthcare/healthcare_walkthrough.md new file mode 100644 index 00000000..66998a36 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/healthcare/healthcare_walkthrough.md @@ -0,0 +1,387 @@ +# Healthcare ABAC — AI-Assisted Walkthrough + +This is a step-by-step example of the **Tier 3 (AI-Assisted)** workflow applied to a healthcare scenario. It shows exactly what you paste into the AI and what you get back. + +--- + +## Step 1 — Get your table DDL + +Run `DESCRIBE TABLE` or `SHOW CREATE TABLE` in a Databricks SQL editor for every table you want ABAC policies on. For this walkthrough we'll use four tables from a hospital data platform. + +The DDL files are in the [`ddl/`](ddl/) subfolder — one file per table: + +| File | Table | Description | +|------|-------|-------------| +| [`ddl/patients.sql`](ddl/patients.sql) | `Patients` | Demographics, contact info, insurance | +| [`ddl/encounters.sql`](ddl/encounters.sql) | `Encounters` | Visits, admissions, diagnoses, clinical notes | +| [`ddl/prescriptions.sql`](ddl/prescriptions.sql) | `Prescriptions` | Medications and dosages | +| [`ddl/billing.sql`](ddl/billing.sql) | `Billing` | Financial records, insurance claims | + +To use these with the automated generator: + +```bash +# 1. 
Set up config (one-time) +cp auth.auto.tfvars.example auth.auto.tfvars # credentials (gitignored) +cp env.auto.tfvars.example env.auto.tfvars # tables + environment + +# 2. Copy the healthcare DDL files into the ddl/ folder +cp examples/healthcare/ddl/*.sql ddl/ + +# 3. Generate (reads uc_tables from env.auto.tfvars) +python generate_abac.py +``` + +## Step 2 — Generate ABAC configuration + +**Option A — Automated** (recommended): Run the commands above and skip to Step 4. + +**Option B — Manual**: Open `ABAC_PROMPT.md`, copy the entire prompt section, and paste it into ChatGPT / Claude / Cursor. Then paste the DDL from the files above where it says `-- Paste your SHOW CREATE TABLE output or CREATE TABLE DDL here.` + +## Step 3 — AI generates two files + +The AI analyzes your columns and produces the following. + +### File 1: `masking_functions.sql` + +```sql +USE CATALOG ; -- same catalog you used in Step 1 +USE SCHEMA clinical; + +-- === PII Masking === + +CREATE OR REPLACE FUNCTION mask_pii_partial(input STRING) +RETURNS STRING +COMMENT 'Masks middle characters; shows first and last character only.' +RETURN CASE + WHEN input IS NULL THEN NULL + WHEN LENGTH(input) <= 2 THEN REPEAT('*', LENGTH(input)) + ELSE CONCAT(LEFT(input, 1), REPEAT('*', LENGTH(input) - 2), RIGHT(input, 1)) +END; + +CREATE OR REPLACE FUNCTION mask_ssn(ssn STRING) +RETURNS STRING +COMMENT 'Shows last 4 digits of SSN only.' +RETURN CASE + WHEN ssn IS NULL THEN NULL + ELSE CONCAT('***-**-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) +END; + +CREATE OR REPLACE FUNCTION mask_email(email STRING) +RETURNS STRING +COMMENT 'Masks local part of email; preserves domain.' +RETURN CASE + WHEN email IS NULL THEN NULL + WHEN INSTR(email, '@') = 0 THEN '****' + ELSE CONCAT('****@', SUBSTRING_INDEX(email, '@', -1)) +END; + +CREATE OR REPLACE FUNCTION mask_phone(phone STRING) +RETURNS STRING +COMMENT 'Shows last 4 digits of phone number.' 
+RETURN CASE + WHEN phone IS NULL THEN NULL + ELSE CONCAT('***-***-', RIGHT(REGEXP_REPLACE(phone, '[^0-9]', ''), 4)) +END; + +CREATE OR REPLACE FUNCTION mask_redact(input STRING) +RETURNS STRING +COMMENT 'Fully redacts the value.' +RETURN CASE + WHEN input IS NULL THEN NULL + ELSE '[REDACTED]' +END; + +-- === Health / PHI Masking === + +CREATE OR REPLACE FUNCTION mask_mrn(mrn STRING) +RETURNS STRING +COMMENT 'Masks MRN; shows last 4 characters.' +RETURN CASE + WHEN mrn IS NULL THEN NULL + WHEN LENGTH(mrn) <= 4 THEN REPEAT('*', LENGTH(mrn)) + ELSE CONCAT(REPEAT('*', LENGTH(mrn) - 4), RIGHT(mrn, 4)) +END; + +CREATE OR REPLACE FUNCTION mask_diagnosis_code(code STRING) +RETURNS STRING +COMMENT 'Shows ICD-10 category (first 3 chars); hides specifics.' +RETURN CASE + WHEN code IS NULL THEN NULL + WHEN LENGTH(code) <= 3 THEN code + ELSE CONCAT(LEFT(code, 3), '.xx') +END; + +-- === Financial Masking === + +CREATE OR REPLACE FUNCTION mask_amount_rounded(amount DECIMAL(18,2)) +RETURNS DECIMAL(18,2) +COMMENT 'Rounds to nearest 100 for approximate visibility.' +RETURN CASE + WHEN amount IS NULL THEN NULL + ELSE ROUND(amount, -2) +END; + +CREATE OR REPLACE FUNCTION mask_account_number(account_id STRING) +RETURNS STRING +COMMENT 'Replaces with deterministic SHA-256 token.' +RETURN CASE + WHEN account_id IS NULL THEN NULL + ELSE CONCAT('ACCT-', LEFT(SHA2(account_id, 256), 12)) +END; + +-- === Row Filters === + +CREATE OR REPLACE FUNCTION filter_facility_us_east() +RETURNS BOOLEAN +COMMENT 'Row filter: only rows where FacilityRegion = US_EAST for current group.' +RETURN + is_account_group_member('US_East_Staff') + OR is_account_group_member('Chief_Medical_Officer'); + +CREATE OR REPLACE FUNCTION filter_facility_us_west() +RETURNS BOOLEAN +COMMENT 'Row filter: only rows where FacilityRegion = US_WEST for current group.' 
+RETURN + is_account_group_member('US_West_Staff') + OR is_account_group_member('Chief_Medical_Officer'); +``` + +### File 2: `abac.auto.tfvars` (ABAC config only — auth is in `auth.auto.tfvars`) + +```hcl +# === Groups === +groups = { + "Nurse" = { description = "Bedside care — partial PII, limited clinical notes" } + "Physician" = { description = "Full clinical access, full PII for their region" } + "Billing_Clerk" = { description = "Financial records — masked PHI, no clinical notes" } + "Chief_Medical_Officer" = { description = "Full unrestricted access across all regions" } + "US_East_Staff" = { description = "Row access limited to US_EAST facility data" } + "US_West_Staff" = { description = "Row access limited to US_WEST facility data" } +} + +# === Tag Policies === +tag_policies = [ + { key = "phi_level", description = "Protected Health Information access tier", values = ["Restricted_PHI", "Limited_PHI", "Full_PHI"] }, + { key = "pii_level", description = "Personally identifiable information tier", values = ["Limited_PII", "Full_PII"] }, + { key = "financial_access", description = "Billing/financial data clearance", values = ["Summary", "Full"] }, + { key = "facility_region", description = "Hospital facility region for row filtering", values = ["Regional"] }, +] + +# === Tag Assignments === +# entity_name is relative to uc_catalog_name.uc_schema_name. 
+tag_assignments = [ + # --- Patients table --- + { entity_type = "tables", entity_name = "Patients", tag_key = "pii_level", tag_value = "Full_PII" }, + { entity_type = "tables", entity_name = "Patients", tag_key = "facility_region", tag_value = "Regional" }, + { entity_type = "columns", entity_name = "Patients.MRN", tag_key = "phi_level", tag_value = "Restricted_PHI" }, + { entity_type = "columns", entity_name = "Patients.FirstName", tag_key = "pii_level", tag_value = "Limited_PII" }, + { entity_type = "columns", entity_name = "Patients.LastName", tag_key = "pii_level", tag_value = "Limited_PII" }, + { entity_type = "columns", entity_name = "Patients.SSN", tag_key = "pii_level", tag_value = "Full_PII" }, + { entity_type = "columns", entity_name = "Patients.Email", tag_key = "pii_level", tag_value = "Limited_PII" }, + { entity_type = "columns", entity_name = "Patients.Phone", tag_key = "pii_level", tag_value = "Limited_PII" }, + { entity_type = "columns", entity_name = "Patients.Address", tag_key = "pii_level", tag_value = "Full_PII" }, + { entity_type = "columns", entity_name = "Patients.InsuranceID", tag_key = "phi_level", tag_value = "Limited_PHI" }, + + # --- Encounters table --- + { entity_type = "tables", entity_name = "Encounters", tag_key = "phi_level", tag_value = "Full_PHI" }, + { entity_type = "tables", entity_name = "Encounters", tag_key = "facility_region", tag_value = "Regional" }, + { entity_type = "columns", entity_name = "Encounters.DiagnosisCode", tag_key = "phi_level", tag_value = "Restricted_PHI" }, + { entity_type = "columns", entity_name = "Encounters.DiagnosisDesc", tag_key = "phi_level", tag_value = "Full_PHI" }, + { entity_type = "columns", entity_name = "Encounters.TreatmentNotes", tag_key = "phi_level", tag_value = "Full_PHI" }, + + # --- Prescriptions table --- + { entity_type = "tables", entity_name = "Prescriptions", tag_key = "phi_level", tag_value = "Limited_PHI" }, + { entity_type = "columns", entity_name = "Prescriptions.DrugName", 
tag_key = "phi_level", tag_value = "Limited_PHI" }, + { entity_type = "columns", entity_name = "Prescriptions.Dosage", tag_key = "phi_level", tag_value = "Limited_PHI" }, + + # --- Billing table --- + { entity_type = "tables", entity_name = "Billing", tag_key = "financial_access", tag_value = "Full" }, + { entity_type = "columns", entity_name = "Billing.TotalAmount", tag_key = "financial_access", tag_value = "Full" }, + { entity_type = "columns", entity_name = "Billing.InsurancePaid", tag_key = "financial_access", tag_value = "Full" }, + { entity_type = "columns", entity_name = "Billing.PatientOwed", tag_key = "financial_access", tag_value = "Summary" }, + { entity_type = "columns", entity_name = "Billing.InsuranceID", tag_key = "phi_level", tag_value = "Limited_PHI" }, +] + +# === FGAC Policies === +# function_name is relative — Terraform prepends catalog.schema automatically. +fgac_policies = [ + # -- PII masking for Nurses -- + { + name = "pii_nurse_partial" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Nurse"] + comment = "Nurses see partial names and contact info" + match_condition = "hasTagValue('pii_level', 'Limited_PII')" + match_alias = "pii_limited" + function_name = "mask_pii_partial" + }, + { + name = "pii_nurse_ssn" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Nurse"] + comment = "Nurses see last-4 SSN only" + match_condition = "hasTagValue('pii_level', 'Full_PII')" + match_alias = "pii_full" + function_name = "mask_ssn" + }, + + # -- PII masking for Billing Clerks -- + { + name = "pii_billing_partial" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Billing_Clerk"] + comment = "Billing clerks see partial patient names" + match_condition = "hasTagValue('pii_level', 'Limited_PII')" + match_alias = "pii_limited" + function_name = "mask_pii_partial" + }, + { + name = "pii_billing_ssn" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Billing_Clerk"] + comment = "Billing clerks cannot see SSN or 
address" + match_condition = "hasTagValue('pii_level', 'Full_PII')" + match_alias = "pii_full" + function_name = "mask_redact" + }, + + # -- PHI masking for Nurses -- + { + name = "phi_nurse_mrn" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Nurse"] + comment = "Nurses see last-4 of MRN" + match_condition = "hasTagValue('phi_level', 'Restricted_PHI')" + match_alias = "phi_restricted" + function_name = "mask_mrn" + }, + + # -- PHI masking for Billing Clerks (no clinical details) -- + { + name = "phi_billing_redact" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Billing_Clerk"] + comment = "Billing clerks cannot see diagnosis or treatment notes" + match_condition = "hasTagValue('phi_level', 'Full_PHI')" + match_alias = "phi_full" + function_name = "mask_redact" + }, + { + name = "phi_billing_diagnosis" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Billing_Clerk"] + comment = "Billing clerks see ICD category only" + match_condition = "hasTagValue('phi_level', 'Restricted_PHI')" + match_alias = "phi_restricted" + function_name = "mask_diagnosis_code" + }, + + # -- Financial masking for Nurses -- + { + name = "fin_nurse_rounded" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["Nurse"] + comment = "Nurses see rounded billing amounts" + match_condition = "hasTagValue('financial_access', 'Full')" + match_alias = "fin_full" + function_name = "mask_amount_rounded" + }, + + # -- Insurance ID masking (tokenize for non-billing staff) -- + { + name = "phi_insurance_mask" + policy_type = "POLICY_TYPE_COLUMN_MASK" + to_principals = ["account users"] + except_principals = ["Billing_Clerk", "Chief_Medical_Officer"] + comment = "Insurance ID tokenized for non-billing staff" + match_condition = "hasTagValue('phi_level', 'Limited_PHI')" + match_alias = "phi_limited" + function_name = "mask_account_number" + }, + + # -- Regional row filters -- + { + name = "region_us_east" + policy_type = "POLICY_TYPE_ROW_FILTER" + 
to_principals = ["US_East_Staff"] + comment = "US East staff see only US_EAST facility data" + when_condition = "hasTagValue('facility_region', 'Regional')" + function_name = "filter_facility_us_east" + }, + { + name = "region_us_west" + policy_type = "POLICY_TYPE_ROW_FILTER" + to_principals = ["US_West_Staff"] + comment = "US West staff see only US_WEST facility data" + when_condition = "hasTagValue('facility_region', 'Regional')" + function_name = "filter_facility_us_west" + }, +] + +# === Group Members (optional — fill in account-level user IDs) === +group_members = {} +``` + +## Step 4 — Validate + +If you used the automated generator, validation runs automatically. For manual flow, save the AI output and run: + +```bash +pip install python-hcl2 +python validate_abac.py generated/abac.auto.tfvars generated/masking_functions.sql +``` + +Expected output: + +``` +============================================================ + ABAC Configuration Validation Report +============================================================ + [PASS] SQL file: 11 function(s) found + [PASS] groups: 6 group(s) defined + [PASS] tag_policies: 4 policy/ies, 9 total values + [PASS] tag_assignments: 23 assignment(s) + [PASS] fgac_policies: 11 policy/ies, 9 unique function(s) +------------------------------------------------------------ + RESULT: PASS (5 passed, 0 warnings, 0 errors) +============================================================ +``` + +All `[PASS]` — safe to proceed. + +## Step 5 — Deploy + +```bash +# 1. Run generated/masking_functions.sql in a Databricks SQL editor +# (make sure USE CATALOG / USE SCHEMA match your auth.auto.tfvars) + +# 2. Copy the generated ABAC config to the module root +cp generated/abac.auto.tfvars abac.auto.tfvars + +# 3. 
Apply (auth.auto.tfvars is loaded automatically)
+terraform init
+terraform plan # review the plan
+terraform apply
+```
+
+## What each group sees after deployment
+
+| Column | Nurse | Physician | Billing Clerk | CMO |
+|--------|-------|-----------|---------------|-----|
+| `Patients.FirstName` | `J***n` | John | `J***n` | John |
+| `Patients.SSN` | `***-**-1234` | 123-45-1234 | `[REDACTED]` | 123-45-1234 |
+| `Patients.MRN` | `****5678` | MRN005678 | `****5678` | MRN005678 |
+| `Encounters.DiagnosisCode` | E11.65 | E11.65 | `E11.xx` | E11.65 |
+| `Encounters.TreatmentNotes` | _full text_ | _full text_ | `[REDACTED]` | _full text_ |
+| `Billing.TotalAmount` | `$1,234.56` → `$1,200` (rounded) | `$1,234.56` | `$1,234.56` | `$1,234.56` |
+| `Patients.InsuranceID` | `ACCT-a1b2c3d4...` | `ACCT-a1b2c3d4...` | INS-9876543 | INS-9876543 |
+| **Row visibility** | All regions | All regions | All regions | All regions |
+| **US_East_Staff** | US_EAST rows only | — | — | — |
+
+## Key design decisions the AI made
+
+1. **Four sensitivity dimensions**: `phi_level`, `pii_level`, `financial_access`, `facility_region` — mapped to HIPAA categories
+2. **Nurse vs Billing separation**: Nurses see clinical data but masked financials; Billing Clerks see financials but redacted clinical notes — classic HIPAA minimum necessary principle
+3. **CMO as unrestricted**: `Chief_Medical_Officer` is excluded via `except_principals` where needed and has no masking policies applied
+4. **Regional row filters**: `US_East_Staff` / `US_West_Staff` can only see encounters and patients from their facility — implemented with `is_account_group_member()` checks in the filter UDFs
+5. 
**Insurance ID tokenized**: Deterministic SHA-256 hash so non-billing staff can still join across tables without seeing the real policy number diff --git a/uc-quickstart/utils/genie/aws/examples/healthcare/masking_functions.sql b/uc-quickstart/utils/genie/aws/examples/healthcare/masking_functions.sql new file mode 100644 index 00000000..27c5261e --- /dev/null +++ b/uc-quickstart/utils/genie/aws/examples/healthcare/masking_functions.sql @@ -0,0 +1,108 @@ +USE CATALOG ; -- replace with your catalog name +USE SCHEMA ; -- replace with your schema name + +-- === PII Masking === + +CREATE OR REPLACE FUNCTION mask_pii_partial(input STRING) +RETURNS STRING +COMMENT 'Masks middle characters; shows first and last character only.' +RETURN CASE + WHEN input IS NULL THEN NULL + WHEN LENGTH(input) <= 2 THEN REPEAT('*', LENGTH(input)) + ELSE CONCAT(LEFT(input, 1), REPEAT('*', LENGTH(input) - 2), RIGHT(input, 1)) +END; + +CREATE OR REPLACE FUNCTION mask_ssn(ssn STRING) +RETURNS STRING +COMMENT 'Shows last 4 digits of SSN only.' +RETURN CASE + WHEN ssn IS NULL THEN NULL + WHEN LENGTH(REGEXP_REPLACE(ssn, '[^0-9]', '')) = 9 THEN + CONCAT('XXX-XX-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) + ELSE 'XXX-XX-XXXX' +END; + +CREATE OR REPLACE FUNCTION mask_email(email STRING) +RETURNS STRING +COMMENT 'Masks local part of email; preserves domain.' +RETURN CASE + WHEN email IS NULL THEN NULL + WHEN LOCATE('@', email) > 0 THEN + CONCAT('****', SUBSTRING(email, LOCATE('@', email))) + ELSE '****' +END; + +CREATE OR REPLACE FUNCTION mask_phone(phone STRING) +RETURNS STRING +COMMENT 'Shows last 4 digits of phone number.' +RETURN CASE + WHEN phone IS NULL THEN NULL + WHEN LENGTH(REGEXP_REPLACE(phone, '[^0-9]', '')) >= 4 THEN + CONCAT('***-***-', RIGHT(REGEXP_REPLACE(phone, '[^0-9]', ''), 4)) + ELSE '***-***-****' +END; + +CREATE OR REPLACE FUNCTION mask_redact(input STRING) +RETURNS STRING +COMMENT 'Fully redacts the value.' 
+RETURN CASE
+    WHEN input IS NULL THEN NULL
+    ELSE '[REDACTED]'
+END;
+
+-- === Health / PHI Masking ===
+
+-- Masks a medical record number: values of 4 characters or fewer become all '*';
+-- longer values keep only the last 4 characters visible.
+CREATE OR REPLACE FUNCTION mask_mrn(mrn STRING)
+RETURNS STRING
+COMMENT 'Masks MRN; shows last 4 characters.'
+RETURN CASE
+    WHEN mrn IS NULL THEN NULL
+    WHEN LENGTH(mrn) <= 4 THEN REPEAT('*', LENGTH(mrn))
+    ELSE CONCAT(REPEAT('*', LENGTH(mrn) - 4), RIGHT(mrn, 4))
+END;
+
+-- Keeps the diagnosis category and hides the specific sub-code:
+--   * codes containing '.' keep everything up to and including the first '.'
+--     and append 'XX' (e.g. E11.65 -> E11.XX)
+--   * codes without '.' keep the first 3 characters and append '.XX'
+--   * codes of 3 characters or fewer pass through unchanged
+-- NOTE(review): assumes the first '.' separates ICD-10 category from
+-- specifics — confirm behavior for any non-ICD code formats in these tables.
+CREATE OR REPLACE FUNCTION mask_diagnosis_code(code STRING)
+RETURNS STRING
+COMMENT 'Shows ICD-10 category (first 3 chars); hides specifics.'
+RETURN CASE
+    WHEN code IS NULL THEN NULL
+    WHEN LENGTH(code) <= 3 THEN code
+    WHEN LOCATE('.', code) > 0 THEN
+        CONCAT(SUBSTRING(code, 1, LOCATE('.', code)), 'XX')
+    ELSE CONCAT(LEFT(code, 3), '.XX')
+END;
+
+-- === Financial Masking ===
+
+-- NOTE(review): amounts under 100 are rounded to the nearest 10
+-- (ROUND(amount, -1)), so the 'nearest 100' in the COMMENT only holds for
+-- amounts >= 100 — confirm whether that two-tier behavior is intended.
+CREATE OR REPLACE FUNCTION mask_amount_rounded(amount DECIMAL(18,2))
+RETURNS DECIMAL(18,2)
+COMMENT 'Rounds to nearest 100 for approximate visibility.'
+RETURN CASE
+    WHEN amount IS NULL THEN NULL
+    WHEN amount < 100 THEN ROUND(amount, -1)
+    ELSE ROUND(amount, -2)
+END;
+
+-- Deterministic tokenization: the same account_id always maps to the same
+-- 'ACCT-' + first 12 hex chars of its SHA-256 digest, so masked values can
+-- still be joined/grouped across tables without exposing the raw identifier.
+CREATE OR REPLACE FUNCTION mask_account_number(account_id STRING)
+RETURNS STRING
+COMMENT 'Replaces with deterministic SHA-256 token.'
+RETURN CASE
+    WHEN account_id IS NULL THEN NULL
+    ELSE CONCAT('ACCT-', LEFT(SHA2(account_id, 256), 12))
+END;
+
+-- === Row Filters ===
+
+-- NOTE(review): these zero-argument filters are all-or-nothing per user —
+-- members of the listed groups see EVERY row of a policy-bound table and all
+-- other principals targeted by the policy see NONE. No facility/region column
+-- is examined here; presumably the regional scoping comes from which
+-- facility_region-tagged tables the FGAC policy binds to — confirm against
+-- the row-filter policies in abac.auto.tfvars.
+CREATE OR REPLACE FUNCTION filter_facility_us_east()
+RETURNS BOOLEAN
+COMMENT 'Row filter: only US_EAST facility data visible to regional staff.'
+RETURN
+    is_account_group_member('US_East_Staff')
+    OR is_account_group_member('Chief_Medical_Officer');
+
+-- See NOTE(review) above: same per-user allow/deny semantics as the
+-- US_EAST filter, for the US_WEST staff group.
+CREATE OR REPLACE FUNCTION filter_facility_us_west()
+RETURNS BOOLEAN
+COMMENT 'Row filter: only US_WEST facility data visible to regional staff.'
+RETURN + is_account_group_member('US_West_Staff') + OR is_account_group_member('Chief_Medical_Officer'); diff --git a/uc-quickstart/utils/genie/aws/fgac_policies.tf b/uc-quickstart/utils/genie/aws/fgac_policies.tf new file mode 100644 index 00000000..c00bf30e --- /dev/null +++ b/uc-quickstart/utils/genie/aws/fgac_policies.tf @@ -0,0 +1,62 @@ +# ============================================================================ +# FGAC Policies (data-driven) +# ============================================================================ +# Creates catalog-level ABAC policies from var.fgac_policies. +# Supports both POLICY_TYPE_COLUMN_MASK and POLICY_TYPE_ROW_FILTER. +# Each policy specifies its own catalog, function_catalog, and function_schema. +# +# Prerequisites: +# - Tag policies and entity tag assignments applied +# - Masking / filter UDFs deployed in the target catalog.schema +# - Groups assigned to the workspace +# ============================================================================ + +locals { + fgac_policy_map = { for p in var.fgac_policies : p.name => p } +} + +resource "time_sleep" "wait_for_tag_propagation" { + depends_on = [databricks_tag_policy.policies, databricks_entity_tag_assignment.assignments] + create_duration = "30s" +} + +resource "databricks_policy_info" "policies" { + for_each = local.fgac_policy_map + + provider = databricks.workspace + + name = "${each.value.catalog}_${each.key}" + on_securable_type = "CATALOG" + on_securable_fullname = each.value.catalog + policy_type = each.value.policy_type + for_securable_type = "TABLE" + to_principals = each.value.to_principals + except_principals = length(each.value.except_principals) > 0 ? each.value.except_principals : null + comment = each.value.comment + + when_condition = each.value.when_condition + + match_columns = each.value.policy_type == "POLICY_TYPE_COLUMN_MASK" ? 
[{ + condition = each.value.match_condition + alias = each.value.match_alias + }] : null + + column_mask = each.value.policy_type == "POLICY_TYPE_COLUMN_MASK" ? { + function_name = "${each.value.function_catalog}.${each.value.function_schema}.${each.value.function_name}" + on_column = each.value.match_alias + using = [] + } : null + + row_filter = each.value.policy_type == "POLICY_TYPE_ROW_FILTER" ? { + function_name = "${each.value.function_catalog}.${each.value.function_schema}.${each.value.function_name}" + using = [] + } : null + + depends_on = [ + time_sleep.wait_for_tag_propagation, + databricks_mws_permission_assignment.group_assignments, + databricks_grant.catalog_access, + databricks_grant.terraform_sp_manage_catalog, + null_resource.deploy_masking_functions, + ] +} diff --git a/uc-quickstart/utils/genie/aws/generate_abac.py b/uc-quickstart/utils/genie/aws/generate_abac.py new file mode 100644 index 00000000..95bf0ccb --- /dev/null +++ b/uc-quickstart/utils/genie/aws/generate_abac.py @@ -0,0 +1,951 @@ +#!/usr/bin/env python3 +""" +Generate ABAC masking_functions.sql and abac.auto.tfvars from table DDL files. + +Reads DDL files from a folder (or fetches them live from Databricks), +combines them with the ABAC prompt template, sends to an LLM, and writes +the generated output files. Optionally runs validate_abac.py on the result. + +Authentication: + The script reads auth.auto.tfvars for Databricks credentials and + env.auto.tfvars for uc_tables and environment config. Catalog/schema + for UDF deployment are auto-derived from the first table in uc_tables + (override with --catalog / --schema). 
+ +Supported LLM providers: + - databricks (default) — Claude Sonnet via Databricks Foundation Model API + - anthropic — Claude via the Anthropic API + - openai — GPT-4o / o1 via OpenAI API + +Usage: + # One-time setup + cp auth.auto.tfvars.example auth.auto.tfvars # credentials (gitignored) + cp env.auto.tfvars.example env.auto.tfvars # tables + environment (checked in) + # Edit env.auto.tfvars: + # uc_tables = ["prod.sales.customers", "prod.sales.orders", "prod.finance.*"] + + # Generate (reads tables from uc_tables; catalog/schema auto-derived) + python generate_abac.py + + # Or override tables via CLI + python generate_abac.py --tables prod.sales.customers prod.sales.orders + + # Use a specific provider / model + python generate_abac.py --provider anthropic --model claude-sonnet-4-20250514 + + # Fall back to local DDL files (legacy — requires --catalog / --schema) + cp my_tables.sql ddl/ + python generate_abac.py --catalog my_catalog --schema my_schema +""" + +import argparse +import os +import re +import shutil +import subprocess +import sys +import threading +import time +from pathlib import Path + +PRODUCT_NAME = "genierails" +PRODUCT_VERSION = "0.1.0" + +SCRIPT_DIR = Path(__file__).resolve().parent +PROMPT_TEMPLATE_PATH = SCRIPT_DIR / "ABAC_PROMPT.md" +DEFAULT_AUTH_FILE = SCRIPT_DIR / "auth.auto.tfvars" +DEFAULT_ENV_FILE = SCRIPT_DIR / "env.auto.tfvars" + +REQUIRED_PACKAGES = { + "python-hcl2": "hcl2", + "databricks-sdk": "databricks.sdk", +} + + +def _ensure_packages(): + """Auto-install required packages if missing.""" + missing = [] + for pip_name, import_name in REQUIRED_PACKAGES.items(): + try: + __import__(import_name) + except ImportError: + missing.append(pip_name) + if missing: + print(f" Installing missing packages: {', '.join(missing)}...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "--quiet", *missing], + ) + try: + __import__("databricks.sdk.useragent") + except (ImportError, ModuleNotFoundError): + print(" Upgrading 
databricks-sdk (need databricks.sdk.useragent)...") + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "--quiet", "--upgrade", "databricks-sdk"], + ) + + +_ensure_packages() + + +def _load_tfvars(path: Path, label: str) -> dict: + """Load a single .tfvars file. Returns empty dict if not found.""" + if not path.exists(): + return {} + import hcl2 + try: + with open(path) as f: + cfg = hcl2.load(f) + non_empty = {k: v for k, v in cfg.items() if v} + if non_empty: + print(f" Loaded {label} from: {path}") + return cfg + except Exception as e: + print(f" WARNING: Failed to parse {path}: {e}") + return {} + + +def load_auth_config(auth_file: Path, env_file: Path | None = None) -> dict: + """Load config from auth + env tfvars files. Merges both; env overrides auth.""" + cfg = _load_tfvars(auth_file, "credentials") + if env_file is None: + env_file = auth_file.parent / "env.auto.tfvars" + env_cfg = _load_tfvars(env_file, "environment") + cfg.update(env_cfg) + if "uc_tables" in cfg and cfg["uc_tables"]: + print(f" uc_tables: {', '.join(cfg['uc_tables'])}") + return cfg + + +def configure_databricks_env(auth_cfg: dict): + """Set Databricks SDK env vars from auth config if not already set.""" + mapping = { + "databricks_workspace_host": "DATABRICKS_HOST", + "databricks_client_id": "DATABRICKS_CLIENT_ID", + "databricks_client_secret": "DATABRICKS_CLIENT_SECRET", + } + for tfvar_key, env_key in mapping.items(): + val = auth_cfg.get(tfvar_key, "") + if val and not os.environ.get(env_key): + os.environ[env_key] = val + + +def load_ddl_files(ddl_dir: Path) -> str: + """Read all .sql files from ddl_dir and concatenate them.""" + sql_files = sorted(ddl_dir.glob("*.sql")) + if not sql_files: + print(f"ERROR: No .sql files found in {ddl_dir}") + print(" Place your CREATE TABLE / DESCRIBE TABLE DDL in .sql files there.") + sys.exit(1) + + parts = [] + for f in sql_files: + content = f.read_text().strip() + if content: + parts.append(f"-- Source: {f.name}\n{content}") + 
print(f" Loaded DDL: {f.name} ({len(content)} chars)") + + combined = "\n\n".join(parts) + print(f" Total DDL: {len(combined)} chars from {len(sql_files)} file(s)\n") + return combined + + +def _parse_table_ref(ref: str) -> tuple[str, str, str]: + """Parse 'catalog.schema.table' or 'catalog.schema.*' into parts.""" + parts = ref.split(".") + if len(parts) != 3: + print(f"ERROR: Invalid table reference '{ref}'") + print(" Expected format: catalog.schema.table or catalog.schema.*") + sys.exit(1) + return parts[0], parts[1], parts[2] + + +def format_table_info(table_info) -> str: + """Format a TableInfo object into CREATE TABLE DDL text.""" + full_name = table_info.full_name + lines = [f"-- Table: {full_name}"] + lines.append(f"CREATE TABLE {full_name} (") + if table_info.columns: + col_parts = [] + for col in table_info.columns: + type_text = col.type_text or "STRING" + part = f" {col.name} {type_text}" + if col.comment: + safe = col.comment.replace("'", "''") + part += f" COMMENT '{safe}'" + col_parts.append(part) + lines.append(",\n".join(col_parts)) + lines.append(");") + if table_info.comment: + lines.append(f"-- Table comment: {table_info.comment}") + return "\n".join(lines) + + +def fetch_tables_from_databricks( + table_refs: list[str], + auth_cfg: dict, +) -> tuple[str, list[tuple[str, str]]]: + """Fetch table DDLs from Databricks using the SDK. + + Returns (ddl_text, catalog_schema_pairs) where catalog_schema_pairs + is a deduplicated list of (catalog, schema) tuples found. 
+ """ + from databricks.sdk import WorkspaceClient + + configure_databricks_env(auth_cfg) + w = WorkspaceClient(product=PRODUCT_NAME, product_version=PRODUCT_VERSION) + + tables = [] + for ref in table_refs: + catalog, schema, table = _parse_table_ref(ref) + if table == "*": + print(f" Listing tables in {catalog}.{schema}...") + for t in w.tables.list( + catalog_name=catalog, schema_name=schema + ): + tables.append(t) + print(f" Found: {t.full_name}") + else: + full_name = f"{catalog}.{schema}.{table}" + print(f" Fetching: {full_name}...") + t = w.tables.get(full_name=full_name) + tables.append(t) + + if not tables: + print("ERROR: No tables found for the given references.") + sys.exit(1) + + seen_pairs: dict[tuple[str, str], list[str]] = {} + parts = [] + for t in tables: + parts.append(format_table_info(t)) + cat = t.catalog_name + sch = t.schema_name + pair = (cat, sch) + seen_pairs.setdefault(pair, []).append(t.name) + + ddl_text = "\n\n".join(parts) + catalog_schemas = list(seen_pairs.keys()) + + print( + f" Fetched {len(tables)} table(s) from " + f"{len(catalog_schemas)} catalog.schema pair(s)\n" + ) + return ddl_text, catalog_schemas + + +def build_prompt(ddl_text: str, + catalog_schemas: list[tuple[str, str]] | None = None) -> str: + """Build the full prompt by injecting DDL into the template.""" + template = PROMPT_TEMPLATE_PATH.read_text() + + section_marker = "### MY TABLES" + idx = template.find(section_marker) + + cs_lines = "" + if catalog_schemas: + cs_lines = "Tables span these catalog.schema pairs:\n" + for cat, sch in catalog_schemas: + cs_lines += f" - {cat}.{sch}\n" + cs_lines += ( + "\nFor each fgac_policy, set catalog, function_catalog, and function_schema " + "to match the catalog.schema of the tables the policy applies to.\n" + ) + + if idx == -1: + print("WARNING: Could not find '### MY TABLES' in ABAC_PROMPT.md") + print(" Appending DDL at the end of the prompt instead.\n") + prompt = template + f"\n\n{cs_lines}\n\n{ddl_text}\n" + else: + 
prompt_body = template[:idx].rstrip() + user_input = ( + f"\n\n### MY TABLES\n\n" + f"{cs_lines}\n" + f"```sql\n{ddl_text}\n```\n" + ) + prompt = prompt_body + user_input + + return prompt + + +def extract_code_blocks(response_text: str) -> tuple[str | None, str | None]: + """Extract the SQL and HCL code blocks from the LLM response.""" + sql_block = None + hcl_block = None + + blocks = re.findall(r"```(\w*)\n(.*?)```", response_text, re.DOTALL) + + for lang, content in blocks: + content = content.strip() + lang_lower = lang.lower() + + if lang_lower == "sql" and sql_block is None: + sql_block = content + elif lang_lower in ("hcl", "terraform") and hcl_block is None: + hcl_block = content + elif not lang and sql_block is None and "CREATE" in content.upper() and "FUNCTION" in content.upper(): + sql_block = content + elif not lang and hcl_block is None and "groups" in content and "tag_policies" in content: + hcl_block = content + + return sql_block, hcl_block + + +TFVARS_STRIP_KEYS = { + "databricks_account_id", + "databricks_client_id", + "databricks_client_secret", + "databricks_workspace_id", + "databricks_workspace_host", + "uc_catalog_name", + "uc_schema_name", + "uc_tables", +} + + +def sanitize_tfvars_hcl(hcl_block: str) -> str: + """ + Make AI-generated tfvars easier and safer to use: + - Strip auth variables (these come from auth.auto.tfvars) + - Insert section-level explanations and doc links + """ + + # --- Strip auth fields (and common adjacent headers) --- + stripped_lines: list[str] = [] + for line in hcl_block.splitlines(): + if re.match(r"^\s*#\s*Authentication\b", line, re.IGNORECASE): + continue + if re.match(r"^\s*#\s*Databricks\s+Authentication\b", line, re.IGNORECASE): + continue + + m = re.match(r"^\s*([A-Za-z0-9_]+)\s*=", line) + if m and m.group(1) in TFVARS_STRIP_KEYS: + continue + + stripped_lines.append(line) + + # Collapse excessive blank lines + compact: list[str] = [] + last_blank = False + for line in stripped_lines: + blank = 
line.strip() == "" + if blank and last_blank: + continue + compact.append(line) + last_blank = blank + + text = "\n".join(compact).strip() + "\n" + + # --- Insert explanatory blocks before major sections --- + docs = ( + "# Docs:\n" + "# - Governed tags / tag policies: https://docs.databricks.com/en/database-objects/tags.html\n" + "# - Unity Catalog ABAC overview: https://docs.databricks.com/aws/en/data-governance/unity-catalog/abac\n" + "# - ABAC policies (masks + filters): https://docs.databricks.com/aws/en/data-governance/unity-catalog/abac/policies\n" + "# - Row filters + column masks: https://docs.databricks.com/en/tables/row-and-column-filters.html\n" + "#\n" + ) + + groups_block = ( + "# ----------------------------------------------------------------------------\n" + "# Groups (business roles)\n" + "# ----------------------------------------------------------------------------\n" + "# Keys are group names. Use these to represent business personas (e.g., Analyst,\n" + "# Researcher, Compliance). These groups are used for workspace onboarding,\n" + "# Databricks One consumer access, data grants, and optional Genie Space ACLs.\n" + "#\n" + + docs + ) + + tag_policies_block = ( + "# ----------------------------------------------------------------------------\n" + "# Tag policies (governed tags)\n" + "# ----------------------------------------------------------------------------\n" + "# Each entry defines a governed tag key and the allowed values. 
You’ll assign\n" + "# these tags to tables/columns below, then reference them in FGAC policies.\n" + "#\n" + + docs + ) + + tag_assignments_block = ( + "# ----------------------------------------------------------------------------\n" + "# Tag assignments (classify tables/columns)\n" + "# ----------------------------------------------------------------------------\n" + "# Apply governed tags to Unity Catalog objects.\n" + "# - entity_type: \"tables\" or \"columns\"\n" + "# - entity_name: fully qualified three-level name\n" + "# - table: \"catalog.schema.Table\"\n" + "# - column: \"catalog.schema.Table.Column\"\n" + "# - Table-level tags are optional; use them to scope column masks or row filters\n" + "# to specific tables, or for governance.\n" + "#\n" + + docs + ) + + fgac_block = ( + "# ----------------------------------------------------------------------------\n" + "# FGAC policies (who sees what, and how)\n" + "# ----------------------------------------------------------------------------\n" + "# Each entry creates either a COLUMN MASK or ROW FILTER policy.\n" + "#\n" + "# Common fields:\n" + "# - name: logical name for the policy (must be unique)\n" + "# - policy_type: POLICY_TYPE_COLUMN_MASK | POLICY_TYPE_ROW_FILTER\n" + "# - catalog: catalog this policy is scoped to\n" + "# - function_catalog: catalog where the masking UDF lives\n" + "# - function_schema: schema where the masking UDF lives\n" + "# - to_principals: list of group names who receive this policy\n" + "# - except_principals: optional list of groups excluded (break-glass/admin)\n" + "# - comment: human-readable intent (recommended)\n" + "#\n" + "# For COLUMN MASK:\n" + "# - match_condition: ABAC condition, e.g. 
hasTagValue('phi_level','full_phi')\n" + "# - match_alias: the column alias used by the ABAC engine\n" + "# - function_name: masking UDF name (relative; Terraform prefixes catalog.schema)\n" + "# - when_condition: (optional) scope to specific tagged tables\n" + "#\n" + "# For ROW FILTER:\n" + "# - when_condition: (optional) scope to specific tagged tables\n" + "# - function_name: row filter UDF name (relative; must be zero-argument)\n" + "#\n" + "# Example \u2014 column mask (mask SSN for analysts, exempt compliance):\n" + "# {\n" + "# name = \"mask_ssn_analysts\"\n" + "# policy_type = \"POLICY_TYPE_COLUMN_MASK\"\n" + "# to_principals = [\"Junior_Analyst\", \"Senior_Analyst\"]\n" + "# except_principals = [\"Compliance_Officer\"]\n" + "# comment = \"Mask SSN showing only last 4 digits\"\n" + "# match_condition = \"hasTagValue('pii_level', 'highly_sensitive')\"\n" + "# match_alias = \"masked_ssn\"\n" + "# function_name = \"mask_ssn\"\n" + "# }\n" + "#\n" + "# Example \u2014 row filter (restrict regional staff to their rows):\n" + "# {\n" + "# name = \"filter_us_region\"\n" + "# policy_type = \"POLICY_TYPE_ROW_FILTER\"\n" + "# to_principals = [\"US_Region_Staff\"]\n" + "# comment = \"Only show rows where region = US\"\n" + "# when_condition = \"hasTagValue('region_scope', 'global')\"\n" + "# function_name = \"filter_by_region_us\"\n" + "# }\n" + "#\n" + + docs + ) + + def insert_before(pattern: str, block: str, s: str) -> str: + # Avoid double-inserting if the block already exists nearby + if block.strip() in s: + return s + return re.sub(pattern, block + r"\g<0>", s, count=1, flags=re.MULTILINE) + + text = insert_before(r"^groups\s*=\s*\{", groups_block, text) + text = insert_before(r"^tag_policies\s*=\s*\[", tag_policies_block, text) + text = insert_before(r"^tag_assignments\s*=\s*\[", tag_assignments_block, text) + text = insert_before(r"^fgac_policies\s*=\s*\[", fgac_block, text) + + return text + + +def call_anthropic(prompt: str, model: str) -> str: + """Call 
Claude via the Anthropic API.""" + try: + import anthropic + except ImportError: + print("ERROR: anthropic package not installed. Run:") + print(" pip install anthropic") + sys.exit(2) + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + print("ERROR: ANTHROPIC_API_KEY environment variable not set.") + print(" export ANTHROPIC_API_KEY='sk-ant-...'") + sys.exit(1) + + client = anthropic.Anthropic(api_key=api_key) + print(f" Calling Anthropic ({model})...") + + message = client.messages.create( + model=model, + max_tokens=8192, + messages=[{"role": "user", "content": prompt}], + ) + return message.content[0].text + + +def call_openai(prompt: str, model: str) -> str: + """Call GPT via the OpenAI API.""" + try: + import openai + except ImportError: + print("ERROR: openai package not installed. Run:") + print(" pip install openai") + sys.exit(2) + + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + print("ERROR: OPENAI_API_KEY environment variable not set.") + print(" export OPENAI_API_KEY='sk-...'") + sys.exit(1) + + client = openai.OpenAI(api_key=api_key) + print(f" Calling OpenAI ({model})...") + + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "You are a Databricks Unity Catalog ABAC expert."}, + {"role": "user", "content": prompt}, + ], + max_tokens=8192, + ) + return response.choices[0].message.content + + +def call_databricks(prompt: str, model: str) -> str: + """Call a model via the Databricks Foundation Model API.""" + try: + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.serving import ChatMessage, ChatMessageRole + except ImportError: + print("ERROR: databricks-sdk package not installed. 
Run:") + print(" pip install databricks-sdk") + sys.exit(2) + + from databricks.sdk.config import Config + + cfg = Config(http_timeout_seconds=600, product=PRODUCT_NAME, product_version=PRODUCT_VERSION) + w = WorkspaceClient(config=cfg) + print(f" Calling Databricks FMAPI ({model})...") + + response = w.serving_endpoints.query( + name=model, + messages=[ + ChatMessage(role=ChatMessageRole.SYSTEM, content="You are a Databricks Unity Catalog ABAC expert."), + ChatMessage(role=ChatMessageRole.USER, content=prompt), + ], + max_tokens=8192, + ) + return response.choices[0].message.content + + +PROVIDERS = { + "databricks": { + "call": call_databricks, + "default_model": "databricks-claude-sonnet-4", + }, + "anthropic": { + "call": call_anthropic, + "default_model": "claude-sonnet-4-20250514", + }, + "openai": { + "call": call_openai, + "default_model": "gpt-4o", + }, +} + + +class Spinner: + """Simple terminal spinner for long-running operations.""" + + FRAMES = "ā ‹ā ™ā ¹ā øā ¼ā “ā ¦ā §ā ‡ā " + + def __init__(self, message: str = "Working"): + self._message = message + self._stop = threading.Event() + self._thread: threading.Thread | None = None + self._start_time = 0.0 + + def __enter__(self): + self._start_time = time.time() + self._thread = threading.Thread(target=self._spin, daemon=True) + self._thread.start() + return self + + def __exit__(self, *_): + self._stop.set() + if self._thread: + self._thread.join() + elapsed = time.time() - self._start_time + sys.stderr.write(f"\r {self._message} — done ({elapsed:.1f}s)\n") + sys.stderr.flush() + + def _spin(self): + i = 0 + while not self._stop.is_set(): + elapsed = time.time() - self._start_time + frame = self.FRAMES[i % len(self.FRAMES)] + sys.stderr.write(f"\r {frame} {self._message} ({elapsed:.0f}s)") + sys.stderr.flush() + i += 1 + self._stop.wait(0.1) + + +def call_with_retries(call_fn, prompt: str, model: str, max_retries: int) -> str: + """Call an LLM provider with exponential backoff retries.""" + last_error 
= None + for attempt in range(1, max_retries + 1): + try: + with Spinner(f"Calling LLM (attempt {attempt}/{max_retries})"): + return call_fn(prompt, model) + except Exception as e: + last_error = e + if attempt < max_retries: + wait = min(2 ** attempt, 60) + print(f"\n Attempt {attempt} failed: {e}") + print(f" Retrying in {wait}s...") + time.sleep(wait) + else: + print(f"\n Attempt {attempt} failed: {e}") + raise RuntimeError(f"All {max_retries} attempts failed. Last error: {last_error}") + + +def autofix_tag_policies(tfvars_path: Path) -> int: + """Add tag values used in assignments/policies but missing from tag_policies.""" + text = tfvars_path.read_text() + + allowed: dict[str, list[str]] = {} + for m in re.finditer( + r'\{\s*key\s*=\s*"([^"]+)"[^}]*?values\s*=\s*\[([^\]]*)\]', + text, + re.DOTALL, + ): + allowed[m.group(1)] = re.findall(r'"([^"]+)"', m.group(2)) + + used: dict[str, set[str]] = {} + for m in re.finditer(r'tag_key\s*=\s*"([^"]+)"[^}]*?tag_value\s*=\s*"([^"]+)"', text, re.DOTALL): + used.setdefault(m.group(1), set()).add(m.group(2)) + for m in re.finditer(r"hasTagValue\(\s*'([^']+)'\s*,\s*'([^']+)'\s*\)", text): + used.setdefault(m.group(1), set()).add(m.group(2)) + + added_total = 0 + for key in used: + if key not in allowed: + continue + missing = sorted(used[key] - set(allowed[key])) + if not missing: + continue + old_vals = ", ".join(f'"{v}"' for v in allowed[key]) + new_vals = ", ".join(f'"{v}"' for v in allowed[key] + missing) + text = text.replace( + f'values = [{old_vals}]', + f'values = [{new_vals}]', + 1, + ) + allowed[key].extend(missing) + added_total += len(missing) + for val in missing: + print(f" [AUTOFIX] Added '{val}' to tag_policy '{key}'") + + if added_total: + tfvars_path.write_text(text) + + return added_total + + +def run_validation(out_dir: Path) -> bool: + """Run validate_abac.py on the generated files. 
Returns True if passed.""" + validator = SCRIPT_DIR / "validate_abac.py" + tfvars_path = out_dir / "abac.auto.tfvars" + sql_path = out_dir / "masking_functions.sql" + + if not validator.exists(): + print("\n [SKIP] validate_abac.py not found — skipping validation") + return True + + cmd = [sys.executable, str(validator), str(tfvars_path)] + if sql_path.exists(): + cmd.append(str(sql_path)) + + print("\n Running validation...\n") + result = subprocess.run(cmd, cwd=str(SCRIPT_DIR)) + return result.returncode == 0 + + +def main(): + parser = argparse.ArgumentParser( + description="Generate ABAC configuration from table DDL using AI", + epilog=( + "Examples:\n" + " python generate_abac.py # reads uc_tables from env.auto.tfvars\n" + " python generate_abac.py --tables 'prod.sales.*' # CLI override\n" + " python generate_abac.py --promote # generate + validate + copy to root (legacy)\n" + " python generate_abac.py --dry-run # print prompt without calling LLM\n" + ), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--tables", nargs="+", metavar="CATALOG.SCHEMA.TABLE", + help="Fully-qualified table refs to fetch from Databricks " + "(overrides uc_tables in env.auto.tfvars). " + "E.g. 
prod.sales.customers or prod.sales.* for all tables in a schema", + ) + parser.add_argument("--catalog", help="Catalog for masking UDFs (auto-derived from first uc_tables entry if omitted)") + parser.add_argument("--schema", help="Schema for masking UDFs (auto-derived from first uc_tables entry if omitted)") + parser.add_argument( + "--auth-file", + default=str(DEFAULT_AUTH_FILE), + help="Path to auth tfvars file (default: auth.auto.tfvars)", + ) + parser.add_argument( + "--provider", + choices=list(PROVIDERS.keys()), + default="databricks", + help="LLM provider (default: databricks)", + ) + parser.add_argument("--model", help="Model name (defaults depend on provider)") + parser.add_argument( + "--ddl-dir", + default=str(SCRIPT_DIR / "ddl"), + help="Directory containing .sql DDL files (default: ./ddl/)", + ) + parser.add_argument( + "--out-dir", + default=str(SCRIPT_DIR / "generated"), + help="Output directory for generated files (default: ./generated/)", + ) + parser.add_argument("--max-retries", type=int, default=3, help="Max LLM call attempts with exponential backoff (default: 3)") + parser.add_argument("--skip-validation", action="store_true", help="Skip running validate_abac.py") + parser.add_argument("--promote", action="store_true", + help="Auto-copy generated files to module root after validation passes") + parser.add_argument("--dry-run", action="store_true", help="Build the prompt and print it without calling the LLM") + + args = parser.parse_args() + + ddl_dir = Path(args.ddl_dir) + out_dir = Path(args.out_dir) + auth_file = Path(args.auth_file) + + print("=" * 60) + print(" ABAC Configuration Generator") + print("=" * 60) + + auth_cfg = load_auth_config(auth_file) + + catalog = args.catalog or "" + schema = args.schema or "" + + catalog_schemas: list[tuple[str, str]] | None = None + + # Resolve table refs: CLI --tables overrides uc_tables from config + table_refs = args.tables or auth_cfg.get("uc_tables") or None + + if table_refs: + source = "--tables 
CLI" if args.tables else "uc_tables in auth config" + print(f" Provider: {args.provider}") + print(f" Out dir: {out_dir}") + print(f" Tables: {', '.join(table_refs)} (from {source})") + print() + + ddl_text, catalog_schemas = fetch_tables_from_databricks( + table_refs, auth_cfg, + ) + + if not catalog or not schema: + if not catalog_schemas: + print("ERROR: No tables found — cannot determine UDF deployment location.") + print(" Use --catalog and --schema to specify explicitly.") + sys.exit(1) + catalog = catalog or catalog_schemas[0][0] + schema = schema or catalog_schemas[0][1] + + if catalog_schemas and len(catalog_schemas) > 1: + print(" Masking UDFs will be deployed to:") + for cat, sch in catalog_schemas: + print(f" - {cat}.{sch}") + else: + print(f" Masking UDFs will be deployed to: {catalog}.{schema}") + + # Save fetched DDLs for inspection + ddl_dir.mkdir(parents=True, exist_ok=True) + fetched_path = ddl_dir / "_fetched.sql" + fetched_path.write_text(ddl_text + "\n") + print(f" Fetched DDLs saved to: {fetched_path}") + else: + # Legacy mode: read from ddl/ directory + if not catalog: + print("ERROR: --catalog is required when using DDL files (no uc_tables configured).") + sys.exit(1) + if not schema: + print("ERROR: --schema is required when using DDL files (no uc_tables configured).") + sys.exit(1) + + if not ddl_dir.exists(): + print(f"\nERROR: DDL directory '{ddl_dir}' does not exist.") + print(f" mkdir -p {ddl_dir}") + print(" # Then place your CREATE TABLE .sql files there") + sys.exit(1) + + print(f" Catalog: {catalog}") + print(f" Schema: {schema}") + print(f" Provider: {args.provider}") + print(f" DDL dir: {ddl_dir}") + print(f" Out dir: {out_dir}") + print() + + ddl_text = load_ddl_files(ddl_dir) + + prompt = build_prompt( + ddl_text, + catalog_schemas=catalog_schemas, + ) + + if args.dry_run: + print("=" * 60) + print(" DRY RUN — Prompt that would be sent:") + print("=" * 60) + print(prompt) + sys.exit(0) + + if args.provider == "databricks": + 
configure_databricks_env(auth_cfg) + + provider_cfg = PROVIDERS[args.provider] + model = args.model or provider_cfg["default_model"] + call_fn = provider_cfg["call"] + + response_text = call_with_retries(call_fn, prompt, model, args.max_retries) + + sql_block, hcl_block = extract_code_blocks(response_text) + + if not sql_block: + print("\nWARNING: Could not extract SQL code block from the response.") + print(" The full response will be saved to generated_response.md for manual extraction.\n") + if not hcl_block: + print("\nWARNING: Could not extract HCL code block from the response.") + print(" The full response will be saved to generated_response.md for manual extraction.\n") + + out_dir.mkdir(parents=True, exist_ok=True) + + response_path = out_dir / "generated_response.md" + response_path.write_text(response_text) + print(f"\n Full LLM response saved to: {response_path}") + + tuning_md = f"""# Review & Tune (Before Apply) + +This folder contains a **first draft** of: +- `masking_functions.sql` — masking UDFs + row filter functions +- `abac.auto.tfvars` — groups, tags, FGAC policies, and Genie Space config + +Before you apply, tune for your business roles, security requirements, and Genie accuracy: + +## Checklist — Genie Accuracy (review first) + +- **Benchmarks**: Each benchmark question must be **unambiguous and self-contained**. The natural-language question and its ground-truth SQL must agree on the exact scope — e.g., "What is the average risk score for **active** customers?" (not "What is the average customer risk score?"). Run benchmarks in the Genie UI after apply to verify accuracy. +- **SQL filters**: Do the default WHERE clauses match your business definitions? (e.g., "active customers" = `CustomerStatus = 'Active'`, "completed transactions" = `TransactionStatus = 'Completed'`). These filters guide Genie's SQL generation. +- **SQL measures**: Are the standard metrics correct? (e.g., total revenue = `SUM(Amount)`, average risk = `AVG(RiskScore)`). 
+- **SQL expressions**: Are the computed dimensions useful? (e.g., transaction year, age bucket). +- **Join specs**: Do the join conditions between tables use the correct keys? Incorrect joins cause wrong results across all multi-table queries. +- **Instructions**: Does the instruction text define business defaults (e.g., "customer" means active by default) and domain conventions (date handling, metric calculations)? + +## Checklist — ABAC & Masking + +- **Groups and personas**: Do the groups map to real business roles? +- **Sensitive columns**: Are the right columns tagged (PII/PHI/financial/etc.)? +- **Masking behavior**: Are you using the right approach (partial, redact, hash) per sensitivity and use case? +- **Row filters and exceptions**: Are filters too broad/strict? Are exceptions minimal and intentional? + +## Checklist — Genie Space Metadata + +- **Genie title & description**: Does the AI-generated title/description accurately represent the space? +- **Genie sample questions**: Do the sample questions reflect what business users will ask? +- **Validate before apply**: Run validation before `terraform apply`. + +## Suggested workflow + +1. Review and edit `masking_functions.sql` and `abac.auto.tfvars` in `generated/`. +2. Validate after each change: + ```bash + make validate-generated + ``` +3. 
When ready, apply (validates again, promotes to root, runs terraform): + ```bash + make apply + ``` + +""" + + tuning_path = out_dir / "TUNING.md" + tuning_path.write_text(tuning_md) + print(f" Tuning checklist written to: {tuning_path}") + + if sql_block: + all_cs = catalog_schemas if catalog_schemas else [(catalog, schema)] + targets = ", ".join(f"{c}.{s}" for c, s in all_cs) + sql_header = ( + "-- ============================================================================\n" + "-- GENERATED MASKING FUNCTIONS (FIRST DRAFT)\n" + "-- ============================================================================\n" + f"-- Target(s): {targets}\n" + "-- Next: review generated/TUNING.md, tune if needed, then run this SQL.\n" + "-- ============================================================================\n\n" + ) + + final_sql = sql_header + sql_block + sql_path = out_dir / "masking_functions.sql" + sql_path.write_text(final_sql + "\n") + print(f" masking_functions.sql written to: {sql_path}") + print(f" Target schemas: {targets}") + + if hcl_block: + hcl_header = ( + "# ============================================================================\n" + "# GENERATED ABAC CONFIG (FIRST DRAFT)\n" + "# ============================================================================\n" + "# NOTE: Authentication comes from auth.auto.tfvars, environment from env.auto.tfvars.\n" + "# Tune the following before apply:\n" + "# - groups (business roles)\n" + "# - tag_assignments (what data is considered sensitive)\n" + "# - fgac_policies (who sees what, and how)\n" + "# Then validate before copying to root:\n" + "# python validate_abac.py generated/abac.auto.tfvars generated/masking_functions.sql\n" + "# ============================================================================\n\n" + ) + + hcl_block = sanitize_tfvars_hcl(hcl_block) + tfvars_path = out_dir / "abac.auto.tfvars" + tfvars_path.write_text(hcl_header + hcl_block + "\n") + print(f" abac.auto.tfvars written to: 
{tfvars_path}") + + n_fixed = autofix_tag_policies(tfvars_path) + if n_fixed: + print(f" Auto-fixed {n_fixed} missing tag_policy value(s)") + + if sql_block and hcl_block and not args.skip_validation: + passed = run_validation(out_dir) + if not passed: + print("\n Validation found errors. Review the output above and fix before running terraform apply.") + sys.exit(1) + + if args.promote and passed: + promoted = [] + for fname in ["abac.auto.tfvars", "masking_functions.sql"]: + src = out_dir / fname + if src.exists(): + shutil.copy2(src, SCRIPT_DIR / fname) + promoted.append(fname) + if promoted: + print(f"\n Promoted to module root: {', '.join(promoted)}") + elif not args.skip_validation and (not sql_block or not hcl_block): + print("\n [SKIP] Validation skipped — could not extract both code blocks.") + print(f" Review {response_path} and manually extract the files.") + + print("\n" + "=" * 60) + print(" Done!") + if sql_block and hcl_block: + if args.promote: + print(" Files promoted to root. Next step:") + print(" make apply (or: terraform init && terraform apply -parallelism=1)") + else: + print(" Next steps:") + print(f" 1. Review the tuning checklist:") + print(f" {out_dir.resolve()}/TUNING.md") + print(f" 2. Review and tune generated files:") + print(f" {out_dir.resolve()}/masking_functions.sql") + print(f" {out_dir.resolve()}/abac.auto.tfvars") + print(" 3. make validate-generated (check your changes anytime)") + print(" 4. 
make apply (validates, promotes to root, runs terraform apply)") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/uc-quickstart/utils/genie/aws/genie_space.tf b/uc-quickstart/utils/genie/aws/genie_space.tf new file mode 100644 index 00000000..ec4f8865 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/genie_space.tf @@ -0,0 +1,98 @@ +# ============================================================================ +# Genie Space — dual-mode lifecycle +# ============================================================================ +# Mode 1 (existing): genie_space_id is set → set ACLs on the existing space. +# Mode 2 (greenfield): genie_space_id is empty → create a new space from +# uc_tables, set ACLs, and trash on destroy. +# ============================================================================ + +# -------------------------------------------------------------------------- +# Mode 1: ACLs on an existing Genie Space +# -------------------------------------------------------------------------- + +resource "null_resource" "genie_space_acls" { + count = var.genie_space_id != "" ? 
1 : 0 + + triggers = { + space_id = var.genie_space_id + groups = join(",", keys(var.groups)) + } + + provisioner "local-exec" { + command = "${path.module}/scripts/genie_space.sh set-acls" + + environment = { + DATABRICKS_HOST = var.databricks_workspace_host + DATABRICKS_CLIENT_ID = var.databricks_client_id + DATABRICKS_CLIENT_SECRET = var.databricks_client_secret + GENIE_SPACE_OBJECT_ID = var.genie_space_id + GENIE_GROUPS_CSV = join(",", keys(var.groups)) + } + } + + depends_on = [ + databricks_group.groups, + databricks_mws_permission_assignment.group_assignments, + ] +} + +# -------------------------------------------------------------------------- +# Mode 2: Create a new Genie Space + set ACLs, trash on destroy +# -------------------------------------------------------------------------- + +resource "null_resource" "genie_space_create" { + count = var.genie_space_id == "" && length(var.uc_tables) > 0 ? 1 : 0 + + triggers = { + tables = join(",", var.uc_tables) + groups = join(",", keys(var.groups)) + warehouse_id = local.effective_warehouse_id + id_file = "${path.module}/.genie_space_id" + script = "${path.module}/scripts/genie_space.sh" + host = var.databricks_workspace_host + client_id = var.databricks_client_id + client_secret = var.databricks_client_secret + } + + provisioner "local-exec" { + command = "${self.triggers.script} create" + + environment = { + DATABRICKS_HOST = self.triggers.host + DATABRICKS_CLIENT_ID = self.triggers.client_id + DATABRICKS_CLIENT_SECRET = self.triggers.client_secret + GENIE_TABLES_CSV = self.triggers.tables + GENIE_GROUPS_CSV = self.triggers.groups + GENIE_WAREHOUSE_ID = self.triggers.warehouse_id + GENIE_TITLE = var.genie_space_title + GENIE_DESCRIPTION = var.genie_space_description + GENIE_SAMPLE_QUESTIONS = length(var.genie_sample_questions) > 0 ? jsonencode(var.genie_sample_questions) : "" + GENIE_INSTRUCTIONS = var.genie_instructions + GENIE_BENCHMARKS = length(var.genie_benchmarks) > 0 ? 
jsonencode(var.genie_benchmarks) : "" + GENIE_SQL_FILTERS = length(var.genie_sql_filters) > 0 ? jsonencode(var.genie_sql_filters) : "" + GENIE_SQL_EXPRESSIONS = length(var.genie_sql_expressions) > 0 ? jsonencode(var.genie_sql_expressions) : "" + GENIE_SQL_MEASURES = length(var.genie_sql_measures) > 0 ? jsonencode(var.genie_sql_measures) : "" + GENIE_JOIN_SPECS = length(var.genie_join_specs) > 0 ? jsonencode(var.genie_join_specs) : "" + GENIE_ID_FILE = self.triggers.id_file + } + } + + provisioner "local-exec" { + when = destroy + command = "${self.triggers.script} trash" + + environment = { + DATABRICKS_HOST = self.triggers.host + DATABRICKS_CLIENT_ID = self.triggers.client_id + DATABRICKS_CLIENT_SECRET = self.triggers.client_secret + GENIE_ID_FILE = self.triggers.id_file + } + } + + depends_on = [ + databricks_group.groups, + databricks_mws_permission_assignment.group_assignments, + databricks_sql_endpoint.warehouse, + null_resource.deploy_masking_functions, + ] +} diff --git a/uc-quickstart/utils/genie/aws/group_members.tf b/uc-quickstart/utils/genie/aws/group_members.tf new file mode 100644 index 00000000..592815fe --- /dev/null +++ b/uc-quickstart/utils/genie/aws/group_members.tf @@ -0,0 +1,35 @@ +# ============================================================================ +# Group Memberships (data-driven) +# ============================================================================ +# Adds users to groups based on var.group_members. +# Map of group name -> list of account-level user IDs. 
+# ============================================================================ + +locals { + group_member_pairs = flatten([ + for group, members in var.group_members : [ + for member_id in members : { + group = group + member_id = member_id + } + ] + ]) + + group_member_map = { + for pair in local.group_member_pairs : + "${pair.group}|${pair.member_id}" => pair + } +} + +resource "databricks_group_member" "members" { + for_each = local.group_member_map + + provider = databricks.account + group_id = databricks_group.groups[each.value.group].id + member_id = each.value.member_id + + depends_on = [ + databricks_group.groups, + databricks_mws_permission_assignment.group_assignments, + ] +} diff --git a/uc-quickstart/utils/genie/aws/import_ids.env.example b/uc-quickstart/utils/genie/aws/import_ids.env.example new file mode 100644 index 00000000..56bee87e --- /dev/null +++ b/uc-quickstart/utils/genie/aws/import_ids.env.example @@ -0,0 +1,10 @@ +# Copy to import_ids.env and fill in IDs so scripts/import_existing.sh can adopt existing resources. +# Get warehouse ID: workspace → SQL → Warehouses → open "Genie Finance Warehouse" → ID from URL/details. +# Get group IDs: Account Console → Identity and access → Groups → open each group → ID. + +# WAREHOUSE_ID=abc123def456 +# GROUP_ID_Junior_Analyst=12345678 +# GROUP_ID_Senior_Analyst=23456789 +# GROUP_ID_US_Region_Staff=34567890 +# GROUP_ID_EU_Region_Staff=45678901 +# GROUP_ID_Compliance_Officer=56789012 diff --git a/uc-quickstart/utils/genie/aws/main.tf b/uc-quickstart/utils/genie/aws/main.tf new file mode 100644 index 00000000..e5b3aadd --- /dev/null +++ b/uc-quickstart/utils/genie/aws/main.tf @@ -0,0 +1,46 @@ +# ============================================================================ +# ABAC Account Groups - Generic Terraform Configuration +# ============================================================================ +# Creates account-level groups, assigns them to a workspace, and grants +# consumer entitlements. 
Groups are driven entirely by var.groups. +# ============================================================================ + +# ---------------------------------------------------------------------------- +# Create Account-Level Groups +# ---------------------------------------------------------------------------- + +resource "databricks_group" "groups" { + for_each = var.groups + + provider = databricks.account + display_name = each.key +} + +# ---------------------------------------------------------------------------- +# Assign Groups to Workspace +# ---------------------------------------------------------------------------- + +resource "databricks_mws_permission_assignment" "group_assignments" { + for_each = databricks_group.groups + + provider = databricks.account + workspace_id = var.databricks_workspace_id + principal_id = each.value.id + permissions = ["USER"] +} + +# ---------------------------------------------------------------------------- +# Grant Consumer Entitlements (Databricks One UI only) +# ---------------------------------------------------------------------------- +# workspace_consume cannot be combined with workspace_access or databricks_sql_access. 
+ +resource "databricks_entitlements" "group_entitlements" { + for_each = databricks_group.groups + + provider = databricks.workspace + group_id = each.value.id + + workspace_consume = true + + depends_on = [databricks_mws_permission_assignment.group_assignments] +} diff --git a/uc-quickstart/utils/genie/aws/masking_functions.tf b/uc-quickstart/utils/genie/aws/masking_functions.tf new file mode 100644 index 00000000..2ad92a44 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/masking_functions.tf @@ -0,0 +1,48 @@ +# ============================================================================ +# Masking Functions Deployment +# ============================================================================ +# Executes masking_functions.sql via the Databricks Statement Execution API +# before FGAC policies are created. Uses local.effective_warehouse_id which +# is either the user-provided sql_warehouse_id or an auto-created warehouse. +# +# Re-runs automatically when the SQL file content changes (filemd5 trigger). +# CREATE OR REPLACE FUNCTION is idempotent, so re-execution is safe. 
+# ============================================================================
+
+resource "null_resource" "deploy_masking_functions" {
+  triggers = {
+    sql_hash      = filemd5("${path.module}/masking_functions.sql") # resolve against module dir, not terraform's cwd
+    sql_file      = "${path.module}/masking_functions.sql"
+    script        = "${path.module}/deploy_masking_functions.py"
+    warehouse_id  = local.effective_warehouse_id
+    host          = var.databricks_workspace_host
+    client_id     = var.databricks_client_id
+    client_secret = var.databricks_client_secret
+  }
+
+  provisioner "local-exec" {
+    command = "python3 ${self.triggers.script} --sql-file ${self.triggers.sql_file} --warehouse-id ${self.triggers.warehouse_id}"
+
+    environment = {
+      DATABRICKS_HOST          = self.triggers.host
+      DATABRICKS_CLIENT_ID     = self.triggers.client_id
+      DATABRICKS_CLIENT_SECRET = self.triggers.client_secret
+    }
+  }
+
+  provisioner "local-exec" {
+    when    = destroy
+    command = "python3 ${self.triggers.script} --sql-file ${self.triggers.sql_file} --warehouse-id ${self.triggers.warehouse_id} --drop"
+
+    environment = {
+      DATABRICKS_HOST          = self.triggers.host
+      DATABRICKS_CLIENT_ID     = self.triggers.client_id
+      DATABRICKS_CLIENT_SECRET = self.triggers.client_secret
+    }
+  }
+
+  depends_on = [
+    time_sleep.wait_for_tag_propagation,
+    databricks_sql_endpoint.warehouse,
+  ]
+}
diff --git a/uc-quickstart/utils/genie/aws/masking_functions_library.sql b/uc-quickstart/utils/genie/aws/masking_functions_library.sql
new file mode 100644
index 00000000..41552620
--- /dev/null
+++ b/uc-quickstart/utils/genie/aws/masking_functions_library.sql
@@ -0,0 +1,240 @@
+-- ============================================================================
+-- REUSABLE MASKING FUNCTIONS LIBRARY
+-- ============================================================================
+-- A categorized library of masking UDFs for Unity Catalog ABAC.
+-- Pick the functions you need, find-replace {catalog}.{schema} with your own,
+-- then execute only the selected functions in your Databricks workspace.
+-- +-- Categories: +-- PII : Personal identifiable information masking +-- Financial : Credit card, account number, monetary amounts +-- Health : Medical record numbers, diagnosis codes +-- General : Redact, hash, nullify utilities +-- Row Filters: Region-based, time-based, audit filters +-- ============================================================================ + +USE CATALOG {catalog}; +USE SCHEMA {schema}; + +-- ============================================================================ +-- PII MASKING FUNCTIONS +-- ============================================================================ + +-- Partial PII masking: show first and last character, mask the middle. +-- Input: "John" -> "J**n" +-- Input: "alice@x.com" -> "a*********m" +CREATE OR REPLACE FUNCTION mask_pii_partial(input STRING) +RETURNS STRING +COMMENT 'Partial PII masking — first and last character visible, middle masked. Use for names, addresses, or any short PII string.' +RETURN CASE + WHEN input IS NULL OR input = '' THEN input + WHEN LENGTH(input) <= 2 THEN REPEAT('*', LENGTH(input)) + WHEN LENGTH(input) = 3 THEN CONCAT(LEFT(input, 1), '*', RIGHT(input, 1)) + ELSE CONCAT(LEFT(input, 1), REPEAT('*', LENGTH(input) - 2), RIGHT(input, 1)) +END; + +-- SSN masking: show last 4 digits. +-- Input: "123-45-6789" -> "XXX-XX-6789" +CREATE OR REPLACE FUNCTION mask_ssn(ssn STRING) +RETURNS STRING +COMMENT 'Mask SSN showing only last 4 digits. Use for US Social Security Numbers (GLBA/CCPA).' +RETURN CASE + WHEN ssn IS NULL OR ssn = '' THEN ssn + WHEN LENGTH(REGEXP_REPLACE(ssn, '[^0-9]', '')) = 9 THEN + CONCAT('XXX-XX-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) + ELSE 'XXX-XX-XXXX' +END; + +-- Email masking: preserve domain, mask local part. +-- Input: "john.doe@example.com" -> "****@example.com" +CREATE OR REPLACE FUNCTION mask_email(email STRING) +RETURNS STRING +COMMENT 'Mask email local part, preserve domain. Use for GDPR/privacy-compliant email display.' 
+RETURN CASE + WHEN email IS NULL OR email = '' THEN email + WHEN LOCATE('@', email) > 0 THEN + CONCAT('****', SUBSTRING(email, LOCATE('@', email))) + ELSE '****' +END; + +-- Phone number masking: show last 4 digits. +-- Input: "+1-555-123-4567" -> "***-***-4567" +CREATE OR REPLACE FUNCTION mask_phone(phone STRING) +RETURNS STRING +COMMENT 'Mask phone number showing only last 4 digits.' +RETURN CASE + WHEN phone IS NULL OR phone = '' THEN phone + WHEN LENGTH(REGEXP_REPLACE(phone, '[^0-9]', '')) >= 4 THEN + CONCAT('***-***-', RIGHT(REGEXP_REPLACE(phone, '[^0-9]', ''), 4)) + ELSE '***-***-****' +END; + +-- Full name masking: first initial + last initial. +-- Input: "John Doe" -> "J. D." +CREATE OR REPLACE FUNCTION mask_full_name(name STRING) +RETURNS STRING +COMMENT 'Reduce full name to initials. Use for anonymized reporting.' +RETURN CASE + WHEN name IS NULL OR name = '' THEN name + WHEN LOCATE(' ', name) > 0 THEN + CONCAT(LEFT(name, 1), '. ', LEFT(SUBSTRING(name, LOCATE(' ', name) + 1), 1), '.') + ELSE CONCAT(LEFT(name, 1), '.') +END; + +-- ============================================================================ +-- FINANCIAL MASKING FUNCTIONS +-- ============================================================================ + +-- Full credit card masking. +-- Input: "4532-1234-5678-9010" -> "XXXX-XXXX-XXXX-XXXX" +CREATE OR REPLACE FUNCTION mask_credit_card_full(card_number STRING) +RETURNS STRING +COMMENT 'Full credit card masking for PCI-DSS compliance. All digits hidden.' +RETURN CASE + WHEN card_number IS NULL OR card_number = '' THEN card_number + ELSE 'XXXX-XXXX-XXXX-XXXX' +END; + +-- Credit card last 4 digits visible. +-- Input: "4532-1234-5678-9010" -> "XXXX-XXXX-XXXX-9010" +CREATE OR REPLACE FUNCTION mask_credit_card_last4(card_number STRING) +RETURNS STRING +COMMENT 'Show last 4 digits of credit card. Use for customer verification (PCI-DSS).' 
+RETURN CASE + WHEN card_number IS NULL OR card_number = '' THEN card_number + WHEN LENGTH(REGEXP_REPLACE(card_number, '[^0-9]', '')) >= 4 THEN + CONCAT('XXXX-XXXX-XXXX-', RIGHT(REGEXP_REPLACE(card_number, '[^0-9]', ''), 4)) + ELSE 'XXXX-XXXX-XXXX-XXXX' +END; + +-- Account number tokenization (deterministic hash). +-- Input: "ACC123456" -> "ACCT_a3f9c2b1e8d7" +CREATE OR REPLACE FUNCTION mask_account_number(account_id STRING) +RETURNS STRING +COMMENT 'Deterministic account number tokenization via SHA-256. Preserves join capability across tables.' +RETURN CASE + WHEN account_id IS NULL OR account_id = '' THEN account_id + ELSE CONCAT('ACCT_', LEFT(SHA2(account_id, 256), 12)) +END; + +-- Transaction amount rounding. +-- Input: 1234.56 -> 1200.00 +-- Input: 42.50 -> 40.00 +CREATE OR REPLACE FUNCTION mask_amount_rounded(amount DECIMAL(18,2)) +RETURNS DECIMAL(18,2) +COMMENT 'Round amounts to nearest 10 (< $100) or 100 (>= $100). Use for aggregated analytics.' +RETURN CASE + WHEN amount IS NULL THEN NULL + WHEN amount < 100 THEN ROUND(amount, -1) + ELSE ROUND(amount, -2) +END; + +-- IBAN masking: show country code + last 4. +-- Input: "DE89370400440532013000" -> "DE**************3000" +CREATE OR REPLACE FUNCTION mask_iban(iban STRING) +RETURNS STRING +COMMENT 'Mask IBAN showing country code and last 4 digits. Use for EU banking compliance.' +RETURN CASE + WHEN iban IS NULL OR iban = '' THEN iban + WHEN LENGTH(iban) > 6 THEN + CONCAT(LEFT(iban, 2), REPEAT('*', LENGTH(iban) - 6), RIGHT(iban, 4)) + ELSE REPEAT('*', LENGTH(iban)) +END; + +-- ============================================================================ +-- HEALTH MASKING FUNCTIONS +-- ============================================================================ + +-- Medical Record Number masking. +-- Input: "MRN-12345678" -> "MRN-****5678" +CREATE OR REPLACE FUNCTION mask_mrn(mrn STRING) +RETURNS STRING +COMMENT 'Mask medical record number showing only last 4 digits. Use for HIPAA compliance.' 
+RETURN CASE + WHEN mrn IS NULL OR mrn = '' THEN mrn + WHEN LENGTH(mrn) > 4 THEN + CONCAT(REPEAT('*', LENGTH(mrn) - 4), RIGHT(mrn, 4)) + ELSE REPEAT('*', LENGTH(mrn)) +END; + +-- ICD/diagnosis code masking: show category, hide specifics. +-- Input: "E11.65" -> "E11.XX" +CREATE OR REPLACE FUNCTION mask_diagnosis_code(code STRING) +RETURNS STRING +COMMENT 'Mask diagnosis code sub-category. Shows ICD category but hides specifics for de-identification.' +RETURN CASE + WHEN code IS NULL OR code = '' THEN code + WHEN LOCATE('.', code) > 0 THEN + CONCAT(SUBSTRING(code, 1, LOCATE('.', code)), 'XX') + ELSE code +END; + +-- ============================================================================ +-- GENERAL UTILITY MASKING FUNCTIONS +-- ============================================================================ + +-- Full redaction: replace with a fixed string. +-- Input: "anything" -> "[REDACTED]" +CREATE OR REPLACE FUNCTION mask_redact(input STRING) +RETURNS STRING +COMMENT 'Full redaction — replaces any value with [REDACTED]. Use for maximum restriction.' +RETURN CASE + WHEN input IS NULL THEN NULL + ELSE '[REDACTED]' +END; + +-- Deterministic hash: SHA-256 for consistent pseudonymization. +-- Input: "john@x.com" -> "a7f3c9e2b1d4..." +CREATE OR REPLACE FUNCTION mask_hash(input STRING) +RETURNS STRING +COMMENT 'SHA-256 deterministic hash. Use for pseudonymization that preserves join capability.' +RETURN CASE + WHEN input IS NULL OR input = '' THEN input + ELSE SHA2(input, 256) +END; + +-- Nullify: return NULL regardless of input. +-- Input: "anything" -> NULL +CREATE OR REPLACE FUNCTION mask_nullify(input STRING) +RETURNS STRING +COMMENT 'Return NULL for any input. Use when the column should be completely invisible.' 
+RETURN NULL;
+
+-- ============================================================================
+-- ROW FILTER FUNCTIONS (zero-argument for Unity Catalog ABAC)
+-- ============================================================================
+-- UC row filter policies require zero-argument functions.
+-- The policy's WHEN clause controls which tables the filter applies to.
+
+-- Regional filter — US data only.
+CREATE OR REPLACE FUNCTION filter_by_region_us()
+RETURNS BOOLEAN
+COMMENT 'Row filter: restrict to US customer data (CCPA/GLBA). Apply via WHEN hasTagValue on region tag.'
+RETURN TRUE;
+
+-- Regional filter — EU data only.
+CREATE OR REPLACE FUNCTION filter_by_region_eu()
+RETURNS BOOLEAN
+COMMENT 'Row filter: restrict to EU customer data (GDPR). Apply via WHEN hasTagValue on region tag.'
+RETURN TRUE;
+
+-- Regional filter — APAC data only.
+CREATE OR REPLACE FUNCTION filter_by_region_apac()
+RETURNS BOOLEAN
+COMMENT 'Row filter: restrict to APAC customer data (PDPA). Apply via WHEN hasTagValue on region tag.'
+RETURN TRUE;
+
+-- Trading hours filter. NOTE(review): compares hour() against a fixed 14-21 UTC window, which maps to 9:00 AM-4:00 PM ET only during EST — no DST adjustment and the 9:30 half-hour open cannot be represented at hour granularity; confirm intended behavior.
+CREATE OR REPLACE FUNCTION filter_trading_hours()
+RETURNS BOOLEAN
+COMMENT 'Row filter: restrict access to outside NYSE trading hours (9:30 AM - 4:00 PM ET).'
+RETURN CASE
+  WHEN hour(current_timestamp()) < 14 OR hour(current_timestamp()) >= 21 THEN TRUE
+  ELSE FALSE
+END;
+
+-- Audit expiry filter.
+CREATE OR REPLACE FUNCTION filter_audit_expiry()
+RETURNS BOOLEAN
+COMMENT 'Row filter: temporary access for external auditors. Apply via WHEN hasTagValue on audit tag.'
+RETURN TRUE; diff --git a/uc-quickstart/utils/genie/aws/outputs.tf b/uc-quickstart/utils/genie/aws/outputs.tf new file mode 100644 index 00000000..40f89c9a --- /dev/null +++ b/uc-quickstart/utils/genie/aws/outputs.tf @@ -0,0 +1,64 @@ +# ============================================================================ +# Outputs +# ============================================================================ + +output "group_ids" { + description = "Map of group names to their Databricks group IDs" + value = { + for name, group in databricks_group.groups : name => group.id + } +} + +output "group_names" { + description = "List of all created group names" + value = keys(databricks_group.groups) +} + +output "workspace_assignments" { + description = "Map of group names to their workspace assignment IDs" + value = { + for name, assignment in databricks_mws_permission_assignment.group_assignments : name => assignment.id + } +} + +output "group_entitlements" { + description = "Summary of entitlements granted to each group" + value = { + for name, entitlement in databricks_entitlements.group_entitlements : name => { + workspace_consume = entitlement.workspace_consume + } + } +} + +# ---------------------------------------------------------------------------- +# SQL warehouse (provided or auto-created) +# ---------------------------------------------------------------------------- + +output "sql_warehouse_id" { + description = "Effective SQL warehouse ID (user-provided or auto-created)." + value = local.effective_warehouse_id +} + +output "genie_space_acls_applied" { + description = "Whether Genie Space ACLs were applied (existing or newly created space)" + value = length(null_resource.genie_space_acls) > 0 || length(null_resource.genie_space_create) > 0 +} + +output "genie_space_acls_groups" { + description = "Groups that were granted CAN_RUN on the Genie Space" + value = ( + length(null_resource.genie_space_acls) > 0 || length(null_resource.genie_space_create) > 0 + ? 
keys(var.groups) + : [] + ) +} + +output "genie_space_created" { + description = "Whether a new Genie Space was auto-created by Terraform" + value = length(null_resource.genie_space_create) > 0 +} + +output "genie_groups_csv" { + description = "Comma-separated group names for genie_space.sh" + value = join(",", keys(var.groups)) +} diff --git a/uc-quickstart/utils/genie/aws/provider.tf b/uc-quickstart/utils/genie/aws/provider.tf new file mode 100644 index 00000000..b70f5671 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/provider.tf @@ -0,0 +1,51 @@ +# ============================================================================ +# Terraform Provider Configuration for Finance ABAC Groups +# ============================================================================ + +terraform { + required_providers { + databricks = { + source = "databricks/databricks" + version = "~> 1.91.0" + } + null = { + source = "hashicorp/null" + version = "~> 3.2" + } + time = { + source = "hashicorp/time" + version = "~> 0.12" + } + } + required_version = ">= 1.0" +} + +# ---------------------------------------------------------------------------- +# Databricks Account-Level Provider +# ---------------------------------------------------------------------------- +# This provider is configured for account-level operations (creating groups) + +provider "databricks" { + alias = "account" + host = "https://accounts.cloud.databricks.com" + account_id = var.databricks_account_id + client_id = var.databricks_client_id + client_secret = var.databricks_client_secret +} + +# ---------------------------------------------------------------------------- +# Databricks Workspace-Level Provider +# ---------------------------------------------------------------------------- +# This provider is configured for workspace-level operations (entitlements) +# +# IMPORTANT: The service principal must be added to the workspace with admin +# permissions to manage entitlements. 
You can do this via: +# - Account Console → Workspaces → [workspace] → Permissions → Add service principal +# - Or use databricks_mws_permission_assignment with ADMIN permissions + +provider "databricks" { + alias = "workspace" + host = var.databricks_workspace_host + client_id = var.databricks_client_id + client_secret = var.databricks_client_secret +} diff --git a/uc-quickstart/utils/genie/aws/scripts/genie_space.sh b/uc-quickstart/utils/genie/aws/scripts/genie_space.sh new file mode 100755 index 00000000..af25ab69 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/scripts/genie_space.sh @@ -0,0 +1,592 @@ +#!/usr/bin/env bash +# ============================================================================= +# Genie Space: create / set-acls / trash +# ============================================================================= +# Commands: +# create Create a Genie Space with configured tables and set ACLs. +# Wildcards (catalog.schema.*) are expanded via the UC Tables API. +# (POST /api/2.0/genie/spaces, then PUT permissions for groups). +# set-acls Set CAN_RUN on an existing Genie Space for the configured groups. +# trash Move a Genie Space to trash. Reads space_id from GENIE_ID_FILE. +# +# Authentication (in order of precedence): +# 1. DATABRICKS_TOKEN (PAT) - if set, used directly +# 2. DATABRICKS_CLIENT_ID + DATABRICKS_CLIENT_SECRET (Service Principal OAuth M2M) +# - Requires DATABRICKS_HOST to be set for token endpoint +# +# Configuration: +# GENIE_GROUPS_CSV Required for create/set-acls. Comma-separated group names. +# GENIE_TABLES_CSV Required for create. Comma-separated fully-qualified +# table names (catalog.schema.table). Wildcards (catalog.schema.*) +# are expanded via the UC Tables API. +# GENIE_WAREHOUSE_ID Warehouse ID for create. Falls back to sql_warehouse_id +# in env.auto.tfvars if not set. +# GENIE_TITLE Optional. Title for the new Genie Space (default: "ABAC Genie Space"). +# GENIE_DESCRIPTION Optional. Description for the new Genie Space. 
+# GENIE_SAMPLE_QUESTIONS Optional. JSON array of sample question strings. +# GENIE_INSTRUCTIONS Optional. Text instructions for the Genie LLM. +# GENIE_BENCHMARKS Optional. JSON array of {question, sql} objects. +# GENIE_SQL_FILTERS Optional. JSON array of {sql, display_name, comment, instruction}. +# GENIE_SQL_EXPRESSIONS Optional. JSON array of {alias, sql, display_name, comment, instruction}. +# GENIE_SQL_MEASURES Optional. JSON array of {alias, sql, display_name, comment, instruction}. +# GENIE_JOIN_SPECS Optional. JSON array of {left_table, left_alias, right_table, right_alias, sql, comment, instruction}. +# GENIE_ID_FILE Optional. File path to save the created space ID +# (used by Terraform for lifecycle management). +# +# Usage: +# ./genie_space.sh create [workspace_url] [token] [title] [warehouse_id] +# ./genie_space.sh set-acls [workspace_url] [token] [space_id] +# ./genie_space.sh trash +# +# Or set env and run: ./genie_space.sh create or ./genie_space.sh set-acls +# Re-running create adds a new space each time (not idempotent). 
+# ============================================================================= + +set -e + +UA_HEADER="User-Agent: genierails/0.1.0" + +usage() { + echo "Usage: $0 create [workspace_url] [token] [title] [warehouse_id]" + echo " $0 set-acls [workspace_url] [token] [space_id]" + echo " $0 trash" + echo " Or set DATABRICKS_HOST + DATABRICKS_TOKEN (or DATABRICKS_CLIENT_ID + DATABRICKS_CLIENT_SECRET)" + echo " For create: set GENIE_WAREHOUSE_ID; for set-acls: set GENIE_SPACE_OBJECT_ID" + exit 1 +} + +# ---------- Get OAuth token from Service Principal credentials ---------- +get_sp_token() { + local workspace_url="$1" + local client_id="$2" + local client_secret="$3" + workspace_url="${workspace_url%/}" + + local token_endpoint="${workspace_url}/oidc/v1/token" + + local response + response=$(curl -s -w "\n%{http_code}" -X POST \ + -H "${UA_HEADER}" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "grant_type=client_credentials&client_id=${client_id}&client_secret=${client_secret}&scope=all-apis" \ + "${token_endpoint}") + + local http_code + http_code=$(echo "$response" | tail -n1) + local response_body + response_body=$(echo "$response" | sed '$d') + + if [[ "$http_code" != "200" ]]; then + echo "Failed to get OAuth token (HTTP ${http_code}). Check client_id/client_secret and workspace URL." >&2 + echo "Response: ${response_body}" >&2 + return 1 + fi + + local token + token=$(echo "$response_body" | grep -o '"access_token"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/.*"\([^"]*\)"$/\1/') + if [[ -z "$token" ]]; then + token=$(echo "$response_body" | jq -r '.access_token // empty' 2>/dev/null) + fi + + if [[ -z "$token" ]]; then + echo "Could not parse access_token from OAuth response." 
>&2 + return 1 + fi + + echo "$token" +} + +# ---------- Resolve token: use DATABRICKS_TOKEN or get from SP credentials ---------- +resolve_token() { + local workspace_url="$1" + local explicit_token="$2" + + if [[ -n "$explicit_token" ]]; then + echo "$explicit_token" + return 0 + fi + + if [[ -n "${DATABRICKS_TOKEN:-}" ]]; then + echo "$DATABRICKS_TOKEN" + return 0 + fi + + if [[ -n "${DATABRICKS_CLIENT_ID:-}" && -n "${DATABRICKS_CLIENT_SECRET:-}" ]]; then + echo "Using Service Principal OAuth M2M authentication..." >&2 + get_sp_token "$workspace_url" "$DATABRICKS_CLIENT_ID" "$DATABRICKS_CLIENT_SECRET" + return $? + fi + + echo "No authentication found. Set DATABRICKS_TOKEN or DATABRICKS_CLIENT_ID + DATABRICKS_CLIENT_SECRET." >&2 + return 1 +} + +# ---------- Read sql_warehouse_id from env.auto.tfvars (fallback) ---------- +read_warehouse_from_tfvars() { + local script_dir + script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + local tfvars="${script_dir}/../env.auto.tfvars" + if [[ -f "$tfvars" ]]; then + grep -E '^\s*sql_warehouse_id\s*=' "$tfvars" \ + | sed 's/.*=\s*"\(.*\)".*/\1/' \ + | head -1 + fi +} + +# ---------- Expand wildcard table entries via UC Tables API ---------- +expand_tables() { + local workspace_url="$1" + local token="$2" + local tables_csv="$3" + workspace_url="${workspace_url%/}" + + IFS=',' read -ra RAW_ENTRIES <<< "$tables_csv" + local expanded=() + + for entry in "${RAW_ENTRIES[@]}"; do + entry=$(echo "$entry" | xargs) # trim whitespace + if [[ "$entry" == *.* && "$entry" == *.\* ]]; then + # Wildcard: catalog.schema.* + local catalog schema + catalog=$(echo "$entry" | cut -d. -f1) + schema=$(echo "$entry" | cut -d. -f2) + echo "Expanding wildcard ${entry} via UC Tables API..." 
>&2
+
+      local api_url="${workspace_url}/api/2.1/unity-catalog/tables?catalog_name=${catalog}&schema_name=${schema}"
+      local resp
+      resp=$(curl -s -H "${UA_HEADER}" -H "Authorization: Bearer ${token}" "${api_url}")
+
+      local table_names
+      table_names=$(echo "$resp" | grep -o '"full_name"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/.*"\([^"]*\)"$/\1/')
+      if [[ -z "$table_names" ]]; then
+        table_names=$(echo "$resp" | jq -r '.tables[]?.full_name // empty' 2>/dev/null)
+      fi
+
+      if [[ -z "$table_names" ]]; then
+        echo "WARNING: No tables found for ${catalog}.${schema}.* — skipping wildcard." >&2
+        continue
+      fi
+
+      # Count only the tables added for THIS wildcard entry. The previous
+      # message printed ${#expanded[@]}, which is the cumulative total across
+      # all CSV entries and mislabeled the per-schema count when more than one
+      # wildcard (or plain table) preceded it.
+      local added=0
+      while IFS= read -r tbl; do
+        if [[ -n "$tbl" ]]; then
+          expanded+=("$tbl")
+          added=$((added + 1))
+        fi
+      done <<< "$table_names"
+      echo "  Expanded to ${added} table(s) from ${catalog}.${schema}" >&2
+    else
+      expanded+=("$entry")
+    fi
+  done
+
+  # Join the resolved table list back into a comma-separated string.
+  local IFS=','
+  echo "${expanded[*]}"
+}
+
+# ---------- Set ACLs on a Genie Space (CAN_RUN for configured groups) ----------
+set_genie_acls() {
+  local workspace_url="$1"
+  local token="$2"
+  local space_id="$3"
+  workspace_url="${workspace_url%/}"
+
+  IFS=',' read -ra GENIE_GROUPS <<< "${GENIE_GROUPS_CSV}"
+
+  # Build the access_control_list JSON array by hand (no jq dependency).
+  local access_control=""
+  for g in "${GENIE_GROUPS[@]}"; do
+    access_control="${access_control}{\"group_name\": \"${g}\", \"permission_level\": \"CAN_RUN\"},"
+  done
+  access_control="[${access_control%,}]"
+
+  local body="{\"access_control_list\": ${access_control}}"
+  local path="/api/2.0/permissions/genie/${space_id}"
+
+  echo "Putting permissions on Genie Space ${space_id} for groups: ${GENIE_GROUPS[*]}"
+  local response
+  response=$(curl -s -w "\n%{http_code}" -X PUT \
+    -H "${UA_HEADER}" \
+    -H "Authorization: Bearer ${token}" \
+    -H "Content-Type: application/json" \
+    -d "${body}" \
+    "${workspace_url}${path}")
+
+  # Last line of the curl output is the HTTP status (-w "\n%{http_code}").
+  local http_code
+  http_code=$(echo "$response" | tail -n1)
+  local response_body
+  response_body=$(echo "$response" | sed '$d')
+
+  if [[ "$http_code" != "200" && "$http_code" != "201" ]]; then
+    echo "Request failed 
(HTTP ${http_code}). Check workspace URL, token, and Genie Space ID." + echo "API response: ${response_body}" + exit 1 + fi + echo "Genie Space ACLs updated successfully." +} + +# ---------- Create Genie Space with configured tables then set ACLs ---------- +create_genie_space() { + local workspace_url="$1" + local token="$2" + local title="${3:-${GENIE_TITLE:-ABAC Genie Space}}" + local warehouse_id="$4" + workspace_url="${workspace_url%/}" + + if [[ -z "${GENIE_TABLES_CSV:-}" ]]; then + echo "ERROR: GENIE_TABLES_CSV not set. Pass comma-separated fully-qualified table names." >&2 + echo " Example: GENIE_TABLES_CSV='cat.schema.t1,cat.schema.t2' $0 create" >&2 + exit 1 + fi + + # Expand wildcards before building the API payload + local resolved_csv + resolved_csv=$(expand_tables "$workspace_url" "$token" "$GENIE_TABLES_CSV") + IFS=',' read -ra TABLE_LIST <<< "$resolved_csv" + + local sorted_identifiers=() + while IFS= read -r id; do + [[ -n "$id" ]] && sorted_identifiers+=("$id") + done < <(printf '%s\n' "${TABLE_LIST[@]}" | LC_ALL=C sort) + + if [[ ${#sorted_identifiers[@]} -eq 0 ]]; then + echo "ERROR: No tables resolved after wildcard expansion. Nothing to create." >&2 + exit 1 + fi + + local tables_csv + tables_csv=$(IFS=','; echo "${sorted_identifiers[*]}") + + # Build create + patch bodies via Python for correct JSON escaping. + # The CREATE endpoint doesn't reliably accept sql_snippets/join_specs, + # so we create first with core config, then PATCH to add them. 
+ local python_output + python_output=$(python3 << PYEOF +import json, random, datetime, os + +def gen_id(): + t = int((datetime.datetime.now() - datetime.datetime(1582,10,15)).total_seconds() * 1e7) + hi = (t & 0xFFFFFFFFFFFF0000) | (1 << 12) | ((t & 0xFFFF) >> 4) + lo = random.getrandbits(62) | 0x8000000000000000 + return f"{hi:016x}{lo:016x}" + +tables = [{"identifier": t} for t in sorted("${tables_csv}".split(",")) if t] + +space = {"version": 2, "data_sources": {"tables": tables}} + +# Sample questions +sq_json = os.environ.get("GENIE_SAMPLE_QUESTIONS", "") +if sq_json: + try: + questions = json.loads(sq_json) + if questions: + items = [{"id": gen_id(), "question": [q]} for q in questions] + items.sort(key=lambda x: x["id"]) + space.setdefault("config", {})["sample_questions"] = items + except json.JSONDecodeError: + pass + +# Text instructions +instr = os.environ.get("GENIE_INSTRUCTIONS", "") +if instr: + space.setdefault("instructions", {})["text_instructions"] = [ + {"id": gen_id(), "content": [instr]} + ] + +# Benchmarks +bm_json = os.environ.get("GENIE_BENCHMARKS", "") +if bm_json: + try: + benchmarks = json.loads(bm_json) + if benchmarks: + items = [] + for bm in benchmarks: + items.append({ + "id": gen_id(), + "question": [bm["question"]], + "answer": [{"format": "SQL", "content": [bm["sql"]]}] + }) + items.sort(key=lambda x: x["id"]) + space["benchmarks"] = {"questions": items} + except json.JSONDecodeError: + pass + +body = { + "warehouse_id": "${warehouse_id}", + "title": "${title}", + "serialized_space": json.dumps(space, separators=(',', ':')) +} +desc = os.environ.get("GENIE_DESCRIPTION", "") +if desc: + body["description"] = desc + +# Build patch space with sql_snippets and join_specs (applied after create) +has_patch = False +patch_instructions = dict(space.get("instructions", {})) + +filt_json = os.environ.get("GENIE_SQL_FILTERS", "") +if filt_json: + try: + filters = json.loads(filt_json) + if filters: + items = [{"id": gen_id(), "sql": 
[f["sql"]], "display_name": f["display_name"]} for f in filters] + items.sort(key=lambda x: x["id"]) + patch_instructions.setdefault("sql_snippets", {})["filters"] = items + has_patch = True + except json.JSONDecodeError: + pass + +expr_json = os.environ.get("GENIE_SQL_EXPRESSIONS", "") +if expr_json: + try: + expressions = json.loads(expr_json) + if expressions: + items = [{"id": gen_id(), "alias": e["alias"], "sql": [e["sql"]]} for e in expressions] + items.sort(key=lambda x: x["id"]) + patch_instructions.setdefault("sql_snippets", {})["expressions"] = items + has_patch = True + except json.JSONDecodeError: + pass + +meas_json = os.environ.get("GENIE_SQL_MEASURES", "") +if meas_json: + try: + measures = json.loads(meas_json) + if measures: + items = [{"id": gen_id(), "alias": m["alias"], "sql": [m["sql"]]} for m in measures] + items.sort(key=lambda x: x["id"]) + patch_instructions.setdefault("sql_snippets", {})["measures"] = items + has_patch = True + except json.JSONDecodeError: + pass + +join_json = os.environ.get("GENIE_JOIN_SPECS", "") +if join_json: + try: + joins = json.loads(join_json) + if joins: + items = [] + for j in joins: + items.append({ + "id": gen_id(), + "left": {"identifier": j["left_table"]}, + "right": {"identifier": j["right_table"]}, + "sql": [j["sql"]], + }) + items.sort(key=lambda x: x["id"]) + patch_instructions["join_specs"] = items + has_patch = True + except json.JSONDecodeError: + pass + +patch_body = None +if has_patch: + patch_space = dict(space) + patch_space["instructions"] = patch_instructions + patch_body = {"serialized_space": json.dumps(patch_space, separators=(',', ':'))} + +output = {"create": body} +if patch_body: + output["patch"] = patch_body +print(json.dumps(output)) +PYEOF + ) + + local create_body + create_body=$(echo "$python_output" | python3 -c "import sys,json; print(json.dumps(json.load(sys.stdin)['create']))") + + local patch_body + patch_body=$(echo "$python_output" | python3 -c "import sys,json; 
d=json.load(sys.stdin); print(json.dumps(d['patch']) if 'patch' in d else '')") + + local tables_display + tables_display=$(printf '%s\n' "${sorted_identifiers[@]}" | tr '\n' ' ') + echo "Creating Genie Space '${title}' with warehouse ${warehouse_id} and ${#sorted_identifiers[@]} tables: ${tables_display}" + + local tmpfile + tmpfile=$(mktemp) + echo "$create_body" > "$tmpfile" + + local response + response=$(curl -s -w "\n%{http_code}" -X POST \ + -H "${UA_HEADER}" \ + -H "Authorization: Bearer ${token}" \ + -H "Content-Type: application/json" \ + -d @"${tmpfile}" \ + "${workspace_url}/api/2.0/genie/spaces") + rm -f "$tmpfile" + + local http_code + http_code=$(echo "$response" | tail -n1) + local response_body + response_body=$(echo "$response" | sed '$d') + + if [[ "$http_code" != "200" && "$http_code" != "201" ]]; then + echo "Create Genie Space failed (HTTP ${http_code})." + echo "API response: ${response_body}" + exit 1 + fi + + local space_id + space_id=$(echo "$response_body" | grep -o '"space_id"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/.*"\([^"]*\)"$/\1/') + if [[ -z "$space_id" ]]; then + space_id=$(echo "$response_body" | jq -r '.space_id // empty' 2>/dev/null) + fi + if [[ -z "$space_id" ]]; then + echo "Created space but could not parse space_id from response. Response: ${response_body}" + exit 1 + fi + + echo "Genie Space created: ${space_id}" + + # Save space_id to file for Terraform lifecycle (destroy) + if [[ -n "${GENIE_ID_FILE:-}" ]]; then + echo "$space_id" > "$GENIE_ID_FILE" + echo "Space ID saved to ${GENIE_ID_FILE}" + fi + + # PATCH to add sql_snippets and join_specs (not supported on CREATE) + if [[ -n "$patch_body" ]]; then + echo "Updating Genie Space with sql_snippets and join_specs..." 
+ local patch_tmpfile + patch_tmpfile=$(mktemp) + echo "$patch_body" > "$patch_tmpfile" + + local patch_response + patch_response=$(curl -s -w "\n%{http_code}" -X PATCH \ + -H "${UA_HEADER}" \ + -H "Authorization: Bearer ${token}" \ + -H "Content-Type: application/json" \ + -d @"${patch_tmpfile}" \ + "${workspace_url}/api/2.0/genie/spaces/${space_id}") + rm -f "$patch_tmpfile" + + local patch_http_code + patch_http_code=$(echo "$patch_response" | tail -n1) + + if [[ "$patch_http_code" == "200" || "$patch_http_code" == "201" ]]; then + echo "Genie Space updated with sql_snippets and join_specs." + else + local patch_response_body + patch_response_body=$(echo "$patch_response" | sed '$d') + echo "WARNING: Failed to update Genie Space with sql_snippets/join_specs (HTTP ${patch_http_code})." + echo " API response: ${patch_response_body}" + echo " The space was created successfully. You can add sql_snippets and join_specs manually via the Genie UI." + fi + fi + + echo "Setting ACLs for groups..." + set_genie_acls "$workspace_url" "$token" "$space_id" + echo "Done. Genie Space ID: ${space_id}" +} + +# ---------- Trash (delete) a Genie Space ---------- +trash_genie_space() { + local workspace_url="${DATABRICKS_HOST}" + workspace_url="${workspace_url%/}" + + if [[ -z "$workspace_url" ]]; then + echo "Need workspace URL. Set DATABRICKS_HOST." >&2 + exit 1 + fi + + local token + token=$(resolve_token "$workspace_url" "") || exit 1 + + local space_id="" + + # Read space_id from the ID file + if [[ -n "${GENIE_ID_FILE:-}" && -f "${GENIE_ID_FILE}" ]]; then + space_id=$(cat "${GENIE_ID_FILE}" | tr -d '[:space:]') + fi + + if [[ -z "$space_id" ]]; then + echo "No Genie Space ID file found at ${GENIE_ID_FILE:-}. Nothing to trash." + exit 0 + fi + + echo "Trashing Genie Space ${space_id}..." 
+ local response + response=$(curl -s -w "\n%{http_code}" -X DELETE \ + -H "${UA_HEADER}" \ + -H "Authorization: Bearer ${token}" \ + "${workspace_url}/api/2.0/genie/spaces/${space_id}") + + local http_code + http_code=$(echo "$response" | tail -n1) + local response_body + response_body=$(echo "$response" | sed '$d') + + if [[ "$http_code" == "200" || "$http_code" == "204" ]]; then + echo "Genie Space ${space_id} trashed successfully." + rm -f "${GENIE_ID_FILE}" + elif [[ "$http_code" == "404" ]]; then + echo "Genie Space ${space_id} not found (already deleted). Cleaning up ID file." + rm -f "${GENIE_ID_FILE}" + else + echo "Failed to trash Genie Space (HTTP ${http_code})." + echo "API response: ${response_body}" + exit 1 + fi +} + +# ---------- Main ---------- +COMMAND="${1:-create}" +shift || true + +if [[ "$COMMAND" == "create" ]]; then + WORKSPACE_URL="${1:-${DATABRICKS_HOST}}" + EXPLICIT_TOKEN="${2:-}" + TITLE="${3:-${GENIE_TITLE:-ABAC Genie Space}}" + WAREHOUSE_ID="${4:-${GENIE_WAREHOUSE_ID:-}}" + + if [[ -z "$WORKSPACE_URL" ]]; then + echo "Need workspace URL. Set DATABRICKS_HOST or pass as first argument." + exit 1 + fi + + TOKEN=$(resolve_token "$WORKSPACE_URL" "$EXPLICIT_TOKEN") || exit 1 + + if [[ -z "$WAREHOUSE_ID" ]]; then + WAREHOUSE_ID=$(read_warehouse_from_tfvars) + fi + if [[ -z "$WAREHOUSE_ID" ]]; then + echo "No warehouse ID found. Set GENIE_WAREHOUSE_ID, pass as argument, or configure sql_warehouse_id in env.auto.tfvars." + exit 1 + fi + + # Require groups for create + if [[ -z "${GENIE_GROUPS_CSV:-}" ]]; then + echo "ERROR: GENIE_GROUPS_CSV not set. Pass comma-separated group names." 
>&2 + echo " Example: GENIE_GROUPS_CSV='Analyst,Admin' $0 create" >&2 + exit 1 + fi + + create_genie_space "$WORKSPACE_URL" "$TOKEN" "$TITLE" "$WAREHOUSE_ID" + +elif [[ "$COMMAND" == "set-acls" ]]; then + WORKSPACE_URL="${1:-${DATABRICKS_HOST}}" + EXPLICIT_TOKEN="${2:-}" + SPACE_ID="${3:-${GENIE_SPACE_OBJECT_ID:-}}" + + if [[ -z "$WORKSPACE_URL" ]]; then + echo "Need workspace URL. Set DATABRICKS_HOST or pass as first argument." + exit 1 + fi + + TOKEN=$(resolve_token "$WORKSPACE_URL" "$EXPLICIT_TOKEN") || exit 1 + + if [[ -z "$SPACE_ID" ]]; then + echo "Genie Space ID required. Set GENIE_SPACE_OBJECT_ID or pass as third argument." + exit 1 + fi + + # Require groups for set-acls + if [[ -z "${GENIE_GROUPS_CSV:-}" ]]; then + echo "ERROR: GENIE_GROUPS_CSV not set. Pass comma-separated group names." >&2 + echo " Example: GENIE_GROUPS_CSV='Analyst,Admin' $0 set-acls" >&2 + exit 1 + fi + + set_genie_acls "$WORKSPACE_URL" "$TOKEN" "$SPACE_ID" + +elif [[ "$COMMAND" == "trash" ]]; then + trash_genie_space + +else + usage +fi diff --git a/uc-quickstart/utils/genie/aws/scripts/import_existing.sh b/uc-quickstart/utils/genie/aws/scripts/import_existing.sh new file mode 100755 index 00000000..067b7522 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/scripts/import_existing.sh @@ -0,0 +1,179 @@ +#!/usr/bin/env bash +# ============================================================================= +# Import existing Databricks resources into Terraform state +# ============================================================================= +# Imports groups, tag policies, and FGAC policies that already exist in +# Databricks so that Terraform can manage them without "already exists" errors. 
+# +# Prerequisites: +# - auth.auto.tfvars configured with valid credentials +# - env.auto.tfvars configured with uc_tables and environment settings +# - abac.auto.tfvars configured with groups/tag_policies/fgac_policies +# - terraform init already run +# +# Usage: +# ./scripts/import_existing.sh # import all resource types +# ./scripts/import_existing.sh --groups-only # import only groups +# ./scripts/import_existing.sh --tags-only # import only tag policies +# ./scripts/import_existing.sh --fgac-only # import only FGAC policies +# ./scripts/import_existing.sh --dry-run # show commands without running +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +MODULE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" + +DRY_RUN=false +IMPORT_GROUPS=true +IMPORT_TAGS=true +IMPORT_FGAC=true + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=true ;; + --groups-only) IMPORT_TAGS=false; IMPORT_FGAC=false ;; + --tags-only) IMPORT_GROUPS=false; IMPORT_FGAC=false ;; + --fgac-only) IMPORT_GROUPS=false; IMPORT_TAGS=false ;; + -h|--help) + echo "Usage: $0 [--dry-run] [--groups-only|--tags-only|--fgac-only]" + exit 0 + ;; + *) + echo "Unknown argument: $arg" + echo "Usage: $0 [--dry-run] [--groups-only|--tags-only|--fgac-only]" + exit 1 + ;; + esac +done + +cd "$MODULE_DIR" + +if [ ! -f abac.auto.tfvars ]; then + echo "ERROR: abac.auto.tfvars not found. Configure it before importing." + exit 1 +fi + +if [ ! -d .terraform ]; then + echo "ERROR: .terraform/ not found. Run 'terraform init' first." 
# Extract group names from abac.auto.tfvars using python-hcl2
+ else + while IFS= read -r name; do + [ -z "$name" ] && continue + run_import "databricks_group.groups[\"$name\"]" "$name" + ((imported++)) || true + done <<< "$group_names" + fi + echo "" +fi + +if $IMPORT_TAGS; then + echo "--- Tag Policies ---" + tag_keys=$(extract_tag_keys) + if [ -z "$tag_keys" ]; then + echo " No tag policies found in abac.auto.tfvars." + else + while IFS= read -r key; do + [ -z "$key" ] && continue + run_import "databricks_tag_policy.policies[\"$key\"]" "$key" + ((imported++)) || true + done <<< "$tag_keys" + fi + echo "" +fi + +if $IMPORT_FGAC; then + echo "--- FGAC Policies ---" + fgac_entries=$(extract_fgac_names) + if [ -z "$fgac_entries" ]; then + echo " No FGAC policies found in abac.auto.tfvars." + else + while IFS='|' read -r policy_key policy_name; do + [ -z "$policy_key" ] && continue + run_import "databricks_policy_info.policies[\"$policy_key\"]" "$policy_name" + ((imported++)) || true + done <<< "$fgac_entries" + fi + echo "" +fi + +echo "============================================" +if $DRY_RUN; then + echo " Dry run complete. $imported import(s) would be attempted." +else + echo " Done. $imported import(s) attempted." +fi +echo " Next: terraform plan (to verify state is consistent)" +echo "============================================" diff --git a/uc-quickstart/utils/genie/aws/scripts/sync_tag_policies.py b/uc-quickstart/utils/genie/aws/scripts/sync_tag_policies.py new file mode 100644 index 00000000..289b3c5f --- /dev/null +++ b/uc-quickstart/utils/genie/aws/scripts/sync_tag_policies.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +"""Sync tag policy values from abac.auto.tfvars to Databricks via SDK. + +The Databricks Terraform provider has a bug where it reorders tag policy +values after apply, causing "Provider produced inconsistent result" errors. +This script bypasses Terraform by updating tag policy values directly via +the Databricks SDK, so Terraform can use ignore_changes = [values] safely. 
+ +Usage: + python3 scripts/sync_tag_policies.py [path/to/abac.auto.tfvars] +""" +import os +import sys +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_DIR = SCRIPT_DIR.parent + + +def _load_auth(): + """Read auth.auto.tfvars and set SDK env vars.""" + auth_path = PROJECT_DIR / "auth.auto.tfvars" + if not auth_path.exists(): + return + try: + import hcl2 + except ImportError: + import subprocess + subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "python-hcl2"]) + import hcl2 + + with open(auth_path) as f: + cfg = hcl2.load(f) + + mapping = { + "databricks_workspace_host": "DATABRICKS_HOST", + "databricks_client_id": "DATABRICKS_CLIENT_ID", + "databricks_client_secret": "DATABRICKS_CLIENT_SECRET", + } + for tfvar_key, env_key in mapping.items(): + val = cfg.get(tfvar_key, "") + if val and not os.environ.get(env_key): + os.environ[env_key] = val + + +def main(): + tfvars_path = Path(sys.argv[1]) if len(sys.argv) > 1 else PROJECT_DIR / "abac.auto.tfvars" + if not tfvars_path.exists(): + print(f" [SKIP] {tfvars_path} not found") + return + + import hcl2 + + with open(tfvars_path) as f: + config = hcl2.load(f) + + desired_policies = config.get("tag_policies", []) + if not desired_policies: + print(" [SKIP] No tag_policies found in config") + return + + _load_auth() + + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.tags import TagPolicy, Value + w = WorkspaceClient(product="genierails", product_version="0.1.0") + + existing = {} + for tp in w.tag_policies.list_tag_policies(): + existing[tp.tag_key] = set(v.name for v in (tp.values or [])) + + updated = 0 + for tp in desired_policies: + key = tp["key"] + desired_values = set(tp["values"]) + current_values = existing.get(key) + + if current_values is None: + continue + + if desired_values == current_values: + continue + + missing = desired_values - current_values + removed = current_values - desired_values + all_values = 
sorted(desired_values) + policy = TagPolicy( + tag_key=key, + values=[Value(name=v) for v in all_values], + ) + try: + w.tag_policies.update_tag_policy(tag_key=key, tag_policy=policy, update_mask="values") + changes = [] + if missing: + changes.append(f"added {sorted(missing)}") + if removed: + changes.append(f"removed {sorted(removed)}") + print(f" [SYNC] {key}: {', '.join(changes)}") + updated += 1 + except Exception as e: + print(f" [ERROR] {key}: {e}") + + if updated: + print(f" Synced {updated} tag policy/ies") + else: + print(" Tag policies already in sync") + + +if __name__ == "__main__": + main() diff --git a/uc-quickstart/utils/genie/aws/tag_policies.tf b/uc-quickstart/utils/genie/aws/tag_policies.tf new file mode 100644 index 00000000..32fdced9 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/tag_policies.tf @@ -0,0 +1,26 @@ +# ============================================================================ +# Unity Catalog Tag Policies (data-driven) +# ============================================================================ +# Creates governed tag policies from var.tag_policies. Each entry defines a +# tag key and its allowed values. Tag policies must exist before tags can be +# assigned to entities and before FGAC policies can reference them. +# +# IMPORTANT: ignore_changes on values is required because the Databricks +# provider has a bug where it reorders tag policy values after apply, causing +# "Provider produced inconsistent result" errors. Tag policy value updates +# are handled externally by `make sync-tags` (which calls the Databricks +# SDK to update values before terraform apply). 
+# ============================================================================
+
+resource "databricks_tag_policy" "policies" {
+  for_each = { for tp in var.tag_policies : tp.key => tp }
+
+  provider    = databricks.workspace
+  tag_key     = each.value.key
+  description = each.value.description
+  values      = [for v in each.value.values : { name = v }]
+
+  lifecycle {
+    ignore_changes = [values]
+  }
+}
diff --git a/uc-quickstart/utils/genie/aws/test.sh b/uc-quickstart/utils/genie/aws/test.sh
new file mode 100755
index 00000000..4833842c
--- /dev/null
+++ b/uc-quickstart/utils/genie/aws/test.sh
@@ -0,0 +1,141 @@
+#!/usr/bin/env bash
+# =============================================================================
+# End-to-end validation test for ABAC module examples
+# =============================================================================
+# Validates each example config with:
+#   1. validate_abac.py  (structure, cross-refs, naming)
+#   2. terraform validate (HCL syntax against provider schema)
+#
+# Usage:
+#   ./test.sh            # run all checks
+#   ./test.sh --skip-tf  # skip terraform validate (no init required)
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+SKIP_TF=false
+for arg in "$@"; do
+  case "$arg" in
+    --skip-tf) SKIP_TF=true ;;
+    -h|--help) echo "Usage: $0 [--skip-tf]"; exit 0 ;;
+  esac
+done
+
+PASS=0
+FAIL=0
+ERRORS=""
+
+# Print a per-check result line and update the pass/fail counters.
+# $1: "PASS" or anything else (treated as failure); $2: message to print.
+report() {
+  local status="$1"
+  local msg="$2"
+  if [ "$status" = "PASS" ]; then
+    echo "  āœ“ $msg"
+    # NOTE: must NOT use ((PASS++)) here. Under `set -e`, the arithmetic
+    # command's exit status is 1 whenever its value is 0 — so the first
+    # passing check (PASS going 0 -> 1 via post-increment) would abort the
+    # whole script. Plain assignment has exit status 0 unconditionally.
+    PASS=$((PASS + 1))
+  else
+    echo "  āœ— $msg"
+    FAIL=$((FAIL + 1))
+    ERRORS="${ERRORS}\n  - ${msg}"
+  fi
+}
+
+echo "============================================"
+echo " ABAC Module — End-to-End Validation"
+echo "============================================"
+echo ""
+
+# --- Check prerequisites ---
+if ! python3 -c "import hcl2" 2>/dev/null; then
+  echo "ERROR: python-hcl2 is required. 
Install with: pip install python-hcl2" + exit 2 +fi + +# --- Validate finance example --- +echo "--- Finance Example ---" +FINANCE_TFVARS="examples/finance/finance.tfvars.example" +FINANCE_SQL="examples/finance/0.1finance_abac_functions.sql" + +if [ -f "$FINANCE_TFVARS" ]; then + if python3 validate_abac.py "$FINANCE_TFVARS" "$FINANCE_SQL" > /dev/null 2>&1; then + report "PASS" "finance: validate_abac.py passed" + else + report "FAIL" "finance: validate_abac.py failed" + fi +else + report "FAIL" "finance: $FINANCE_TFVARS not found" +fi + +# --- Validate healthcare example --- +echo "" +echo "--- Healthcare Example ---" +HC_TFVARS="examples/healthcare/healthcare.tfvars.example" +HC_SQL="examples/healthcare/masking_functions.sql" + +if [ -f "$HC_TFVARS" ]; then + if [ -f "$HC_SQL" ]; then + if python3 validate_abac.py "$HC_TFVARS" "$HC_SQL" > /dev/null 2>&1; then + report "PASS" "healthcare: validate_abac.py passed" + else + report "FAIL" "healthcare: validate_abac.py failed" + fi + else + if python3 validate_abac.py "$HC_TFVARS" > /dev/null 2>&1; then + report "PASS" "healthcare: validate_abac.py passed (no SQL file)" + else + report "FAIL" "healthcare: validate_abac.py failed" + fi + fi +else + report "FAIL" "healthcare: $HC_TFVARS not found" +fi + +# --- Validate abac.auto.tfvars.example skeleton --- +echo "" +echo "--- Skeleton Example ---" +SKELETON_TFVARS="abac.auto.tfvars.example" + +if [ -f "$SKELETON_TFVARS" ]; then + if python3 validate_abac.py "$SKELETON_TFVARS" > /dev/null 2>&1; then + report "PASS" "skeleton: validate_abac.py passed" + else + report "FAIL" "skeleton: validate_abac.py failed" + fi +else + report "FAIL" "skeleton: $SKELETON_TFVARS not found" +fi + +# --- Terraform validate (requires terraform init) --- +if ! 
$SKIP_TF; then + echo "" + echo "--- Terraform Validate ---" + + TMPDIR_TF=$(mktemp -d) + trap 'rm -rf "$TMPDIR_TF"' EXIT + + cp "$FINANCE_TFVARS" "$TMPDIR_TF/abac.auto.tfvars" 2>/dev/null || true + cp auth.auto.tfvars.example "$TMPDIR_TF/auth.auto.tfvars" 2>/dev/null || true + cp env.auto.tfvars.example "$TMPDIR_TF/env.auto.tfvars" 2>/dev/null || true + + if terraform -chdir="$SCRIPT_DIR" validate -no-color > "$TMPDIR_TF/tf_validate.log" 2>&1; then + report "PASS" "terraform validate passed" + else + report "FAIL" "terraform validate failed (see output below)" + cat "$TMPDIR_TF/tf_validate.log" | head -20 + fi +fi + +# --- Summary --- +echo "" +echo "============================================" +TOTAL=$((PASS + FAIL)) +if [ "$FAIL" -eq 0 ]; then + echo " RESULT: ALL PASSED ($PASS/$TOTAL checks)" +else + echo " RESULT: $FAIL FAILED ($PASS passed, $FAIL failed)" + echo -e " Failures:$ERRORS" +fi +echo "============================================" + +exit "$FAIL" diff --git a/uc-quickstart/utils/genie/aws/uc_grants.tf b/uc-quickstart/utils/genie/aws/uc_grants.tf new file mode 100644 index 00000000..9cbbb99e --- /dev/null +++ b/uc-quickstart/utils/genie/aws/uc_grants.tf @@ -0,0 +1,53 @@ +# ============================================================================ +# Unity Catalog Data Access Grants +# ============================================================================ +# Uses databricks_grant (singular) which is ADDITIVE — it only manages the +# grants for each specified principal without removing existing permissions +# from other principals on the catalog. +# +# Multi-catalog: catalogs are auto-derived from fully-qualified entity names +# in tag_assignments and catalog fields in fgac_policies. No manual list needed. 
+# ============================================================================ + +locals { + _ta_catalogs = [ + for ta in var.tag_assignments : + split(".", ta.entity_name)[0] + ] + + _fgac_catalogs = [ + for p in var.fgac_policies : + p.catalog + ] + + all_catalogs = distinct(concat( + local._ta_catalogs, + local._fgac_catalogs, + )) +} + +resource "databricks_grant" "terraform_sp_manage_catalog" { + for_each = toset(local.all_catalogs) + + provider = databricks.workspace + catalog = each.value + principal = var.databricks_client_id + privileges = ["USE_CATALOG", "USE_SCHEMA", "EXECUTE", "MANAGE", "CREATE_FUNCTION"] +} + +resource "databricks_grant" "catalog_access" { + for_each = { + for pair in setproduct(local.all_catalogs, keys(var.groups)) : + "${pair[0]}|${pair[1]}" => { catalog = pair[0], group = pair[1] } + } + + provider = databricks.workspace + catalog = each.value.catalog + principal = each.value.group + privileges = ["USE_CATALOG", "USE_SCHEMA", "SELECT"] + + depends_on = [ + databricks_group.groups, + databricks_mws_permission_assignment.group_assignments, + ] +} diff --git a/uc-quickstart/utils/genie/aws/validate_abac.py b/uc-quickstart/utils/genie/aws/validate_abac.py new file mode 100644 index 00000000..48fd6e4f --- /dev/null +++ b/uc-quickstart/utils/genie/aws/validate_abac.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 +""" +Validate AI-generated ABAC configuration before terraform apply. + +Checks: + 1. abac.auto.tfvars structure and required fields + 2. masking_functions.sql function definitions + 3. Cross-references between both files + +Usage: + pip install python-hcl2 # one-time + python validate_abac.py abac.auto.tfvars masking_functions.sql + python validate_abac.py abac.auto.tfvars # skip SQL check +""" + +import sys +import re +import argparse +from pathlib import Path + +try: + import hcl2 +except ImportError: + print("ERROR: python-hcl2 is required. 
Install with:") + print(" pip install python-hcl2") + sys.exit(2) + +VALID_ENTITY_TYPES = {"tables", "columns"} +VALID_POLICY_TYPES = {"POLICY_TYPE_COLUMN_MASK", "POLICY_TYPE_ROW_FILTER"} +BUILTIN_PRINCIPALS = {"account users"} + +COLUMN_MASK_REQUIRED = {"name", "policy_type", "catalog", "to_principals", "match_condition", "match_alias", "function_name", "function_catalog", "function_schema"} +ROW_FILTER_REQUIRED = {"name", "policy_type", "catalog", "to_principals", "function_name", "function_catalog", "function_schema"} + + +class ValidationResult: + def __init__(self): + self.errors: list[str] = [] + self.warnings: list[str] = [] + self.info: list[str] = [] + + def error(self, msg: str): + self.errors.append(msg) + + def warn(self, msg: str): + self.warnings.append(msg) + + def ok(self, msg: str): + self.info.append(msg) + + @property + def passed(self) -> bool: + return len(self.errors) == 0 + + def print_report(self): + width = 60 + print("=" * width) + print(" ABAC Configuration Validation Report") + print("=" * width) + + if self.info: + for line in self.info: + print(f" [PASS] {line}") + + if self.warnings: + print() + for line in self.warnings: + print(f" [WARN] {line}") + + if self.errors: + print() + for line in self.errors: + print(f" [FAIL] {line}") + + print("-" * width) + counts = ( + f"{len(self.info)} passed, " + f"{len(self.warnings)} warnings, " + f"{len(self.errors)} errors" + ) + if self.passed: + print(f" RESULT: PASS ({counts})") + else: + print(f" RESULT: FAIL ({counts})") + print("=" * width) + + +def parse_tfvars(path: Path) -> dict: + with open(path) as f: + return hcl2.load(f) + + +def parse_sql_functions(path: Path) -> set[str]: + """Extract function names from CREATE [OR REPLACE] FUNCTION statements.""" + text = path.read_text() + pattern = re.compile( + r"CREATE\s+(?:OR\s+REPLACE\s+)?FUNCTION\s+" + r"(?:[\w]+\.[\w]+\.)?" # optional catalog.schema. 
prefix + r"([\w]+)\s*\(", + re.IGNORECASE, + ) + return {m.group(1) for m in pattern.finditer(text)} + + +def validate_groups(cfg: dict, result: ValidationResult): + groups = cfg.get("groups") + if not groups: + result.error("'groups' is missing or empty — at least one group is required") + return set() + if not isinstance(groups, dict): + result.error("'groups' must be a map of group_name -> { description = \"...\" }") + return set() + for name, val in groups.items(): + if not isinstance(val, dict): + result.error(f"groups[\"{name}\"] must be an object with a 'description' key") + result.ok(f"groups: {len(groups)} group(s) defined") + return set(groups.keys()) + + +def validate_tag_policies(cfg: dict, result: ValidationResult) -> dict[str, set[str]]: + """Returns a map of tag_key -> set of allowed values.""" + policies = cfg.get("tag_policies", []) + if not isinstance(policies, list): + result.error("'tag_policies' must be a list") + return {} + tag_map: dict[str, set[str]] = {} + seen_keys: set[str] = set() + for i, tp in enumerate(policies): + key = tp.get("key", "") + if not key: + result.error(f"tag_policies[{i}]: 'key' is missing") + continue + if key in seen_keys: + result.error(f"tag_policies[{i}]: duplicate key '{key}'") + seen_keys.add(key) + values = tp.get("values", []) + if not values: + result.error(f"tag_policies[{i}] (key='{key}'): 'values' is empty") + tag_map[key] = set(values) + result.ok(f"tag_policies: {len(policies)} policy/ies, {sum(len(v) for v in tag_map.values())} total values") + return tag_map + + +def validate_tag_assignments(cfg: dict, tag_map: dict[str, set[str]], result: ValidationResult): + assignments = cfg.get("tag_assignments", []) + if not isinstance(assignments, list): + result.error("'tag_assignments' must be a list") + return + seen_keys: set[str] = set() + for i, ta in enumerate(assignments): + prefix = f"tag_assignments[{i}]" + etype = ta.get("entity_type", "") + ename = ta.get("entity_name", "") + tkey = ta.get("tag_key", 
"") + tval = ta.get("tag_value", "") + + if etype not in VALID_ENTITY_TYPES: + result.error(f"{prefix}: entity_type '{etype}' invalid — must be 'tables' or 'columns'") + + dot_count = ename.count(".") + if etype == "tables" and dot_count != 2: + result.error( + f"{prefix}: entity_name '{ename}' must be fully qualified " + f"as 'catalog.schema.table' (expected 2 dots, got {dot_count})" + ) + if etype == "columns" and dot_count != 3: + result.error( + f"{prefix}: entity_name '{ename}' must be fully qualified " + f"as 'catalog.schema.table.column' (expected 3 dots, got {dot_count})" + ) + + if tkey and tkey not in tag_map: + result.error(f"{prefix}: tag_key '{tkey}' not defined in tag_policies") + elif tkey and tval and tval not in tag_map.get(tkey, set()): + result.error( + f"{prefix}: tag_value '{tval}' is not an allowed value for " + f"tag_key '{tkey}' — allowed: {sorted(tag_map[tkey])}" + ) + + composite = f"{etype}|{ename}|{tkey}|{tval}" + if composite in seen_keys: + result.warn(f"{prefix}: duplicate assignment ({etype}, {ename}, {tkey}={tval})") + seen_keys.add(composite) + + result.ok(f"tag_assignments: {len(assignments)} assignment(s)") + + +def validate_fgac_policies( + cfg: dict, + group_names: set[str], + tag_map: dict[str, set[str]], + sql_functions: set[str] | None, + result: ValidationResult, +): + policies = cfg.get("fgac_policies", []) + if not isinstance(policies, list): + result.error("'fgac_policies' must be a list") + return + seen_names: set[str] = set() + referenced_functions: set[str] = set() + + for i, p in enumerate(policies): + name = p.get("name", "") + prefix = f"fgac_policies[{i}] (name='{name}')" + ptype = p.get("policy_type", "") + + if not name: + result.error(f"fgac_policies[{i}]: 'name' is missing") + if name in seen_names: + result.error(f"{prefix}: duplicate policy name") + seen_names.add(name) + + if ptype not in VALID_POLICY_TYPES: + result.error(f"{prefix}: policy_type '{ptype}' invalid — must be one of 
{sorted(VALID_POLICY_TYPES)}") + continue + + provided = {k for k, v in p.items() if v is not None and v != "" and v != []} + + if ptype == "POLICY_TYPE_COLUMN_MASK": + missing = COLUMN_MASK_REQUIRED - provided + if missing: + result.error(f"{prefix}: COLUMN_MASK requires {sorted(missing)}") + elif ptype == "POLICY_TYPE_ROW_FILTER": + missing = ROW_FILTER_REQUIRED - provided + if missing: + result.error(f"{prefix}: ROW_FILTER requires {sorted(missing)}") + + # Validate principals reference existing groups + for principal in p.get("to_principals", []): + if principal.lower() not in BUILTIN_PRINCIPALS and principal not in group_names: + result.error( + f"{prefix}: to_principals group '{principal}' not defined in 'groups'" + ) + for principal in p.get("except_principals", []) or []: + if principal.lower() not in BUILTIN_PRINCIPALS and principal not in group_names: + result.error( + f"{prefix}: except_principals group '{principal}' not defined in 'groups'" + ) + + # Validate condition syntax — only hasTagValue() and hasTag() are allowed + condition = p.get("match_condition") or p.get("when_condition") or "" + for forbidden in ["columnName()", "tableName()", " IN (", " IN("]: + if forbidden in condition: + result.error( + f"{prefix}: condition contains '{forbidden}' which is NOT supported " + f"by Databricks ABAC. Only hasTagValue() and hasTag() are allowed." 
+ ) + for tag_ref in re.findall(r"hasTagValue\(\s*'([^']+)'\s*,\s*'([^']+)'\s*\)", condition): + ref_key, ref_val = tag_ref + if ref_key not in tag_map: + result.error(f"{prefix}: condition references undefined tag_key '{ref_key}'") + elif ref_val not in tag_map.get(ref_key, set()): + result.error( + f"{prefix}: condition references tag_value '{ref_val}' " + f"not in tag_policy '{ref_key}' — allowed: {sorted(tag_map[ref_key])}" + ) + for tag_ref in re.findall(r"hasTag\(\s*'([^']+)'\s*\)", condition): + if tag_ref not in tag_map: + result.error(f"{prefix}: condition references undefined tag_key '{tag_ref}'") + + fn = p.get("function_name", "") + if fn: + referenced_functions.add(fn) + if "." in fn: + result.error( + f"{prefix}: function_name '{fn}' should be relative (no dots) — " + f"Terraform prepends catalog.schema automatically" + ) + + # Cross-reference with SQL file + if sql_functions is not None: + for fn in referenced_functions: + if fn not in sql_functions: + result.error( + f"function '{fn}' referenced in fgac_policies but not found " + f"in SQL file — define it with CREATE OR REPLACE FUNCTION {fn}(...)" + ) + unused = sql_functions - referenced_functions + if unused: + result.warn( + f"SQL file defines functions not used by any policy: {sorted(unused)}. " + f"These will be created but won't mask anything." 
+ ) + + result.ok(f"fgac_policies: {len(policies)} policy/ies, {len(referenced_functions)} unique function(s)") + + +def validate_group_members(cfg: dict, group_names: set[str], result: ValidationResult): + members = cfg.get("group_members", {}) + if not isinstance(members, dict): + result.error("'group_members' must be a map of group_name -> list of user IDs") + return + for grp, ids in members.items(): + if grp not in group_names: + result.error(f"group_members: group '{grp}' not defined in 'groups'") + if not isinstance(ids, list) or not all(isinstance(x, str) for x in ids): + result.error(f"group_members[\"{grp}\"]: must be a list of user ID strings") + if members: + result.ok(f"group_members: {len(members)} group(s) with member assignments") + + +def _find_tfvars_file(tfvars_path: Path, name: str) -> Path | None: + """Locate a sibling tfvars file relative to the given tfvars file.""" + candidates = [ + tfvars_path.parent / name, + tfvars_path.parent.parent / name, + ] + for p in candidates: + if p.exists(): + return p + return None + + +def validate_auth(cfg: dict, result: ValidationResult, tfvars_path: Path): + required = [ + "databricks_account_id", + "databricks_client_id", + "databricks_client_secret", + "databricks_workspace_id", + "databricks_workspace_host", + ] + + auth_cfg = dict(cfg) + for fname in ["auth.auto.tfvars", "env.auto.tfvars"]: + found = _find_tfvars_file(tfvars_path, fname) + if found: + try: + file_cfg = parse_tfvars(found) + for k, v in file_cfg.items(): + if v and not auth_cfg.get(k): + auth_cfg[k] = v + result.ok(f"Vars loaded from {found.name}") + except Exception as e: + result.warn(f"Could not parse {found}: {e}") + + for key in required: + val = auth_cfg.get(key, "") + if not val: + result.warn(f"'{key}' is empty — fill in before terraform apply") + else: + result.ok(f"{key}: set") + + +def main(): + parser = argparse.ArgumentParser( + description="Validate AI-generated ABAC configuration files", + epilog="Example: python 
validate_abac.py abac.auto.tfvars masking_functions.sql", + ) + parser.add_argument("tfvars", help="Path to abac.auto.tfvars file") + parser.add_argument("sql", nargs="?", help="Path to masking_functions.sql (optional)") + args = parser.parse_args() + + tfvars_path = Path(args.tfvars) + sql_path = Path(args.sql) if args.sql else None + + if not tfvars_path.exists(): + print(f"ERROR: {tfvars_path} not found") + sys.exit(1) + + result = ValidationResult() + + # --- Parse tfvars --- + try: + cfg = parse_tfvars(tfvars_path) + except Exception as e: + result.error(f"Failed to parse {tfvars_path}: {e}") + result.print_report() + sys.exit(1) + + # --- Parse SQL (optional) --- + sql_functions: set[str] | None = None + if sql_path: + if not sql_path.exists(): + result.error(f"SQL file {sql_path} not found") + else: + sql_functions = parse_sql_functions(sql_path) + if not sql_functions: + result.warn( + f"No CREATE FUNCTION statements found in {sql_path} — " + f"is it the right file?" + ) + else: + result.ok(f"SQL file: {len(sql_functions)} function(s) found — {sorted(sql_functions)}") + + # --- Run validations --- + validate_auth(cfg, result, tfvars_path) + group_names = validate_groups(cfg, result) + tag_map = validate_tag_policies(cfg, result) + validate_tag_assignments(cfg, tag_map, result) + validate_fgac_policies(cfg, group_names, tag_map, sql_functions, result) + validate_group_members(cfg, group_names, result) + + result.print_report() + sys.exit(0 if result.passed else 1) + + +if __name__ == "__main__": + main() diff --git a/uc-quickstart/utils/genie/aws/variables.tf b/uc-quickstart/utils/genie/aws/variables.tf new file mode 100644 index 00000000..fd1d3c61 --- /dev/null +++ b/uc-quickstart/utils/genie/aws/variables.tf @@ -0,0 +1,224 @@ +# ============================================================================ +# Variables for Generic ABAC Terraform Module +# ============================================================================ + +# 
# ----------------------------------------------------------------------------
# Authentication
# ----------------------------------------------------------------------------

variable "databricks_account_id" {
  type        = string
  description = "The Databricks account ID"
}

variable "databricks_client_id" {
  type        = string
  description = "The Databricks service principal client ID for authentication"
}

variable "databricks_client_secret" {
  type        = string
  description = "The Databricks service principal client secret for authentication"
  # Marked sensitive so Terraform redacts it from plan/apply output.
  sensitive   = true
}

variable "databricks_workspace_id" {
  type        = string
  description = "The Databricks workspace ID where the groups will be assigned"
}

variable "databricks_workspace_host" {
  type        = string
  description = "The Databricks workspace URL (e.g., https://myworkspace.cloud.databricks.com)"
}

# ----------------------------------------------------------------------------
# Unity Catalog tables (used by generate_abac.py only)
# ----------------------------------------------------------------------------

variable "uc_tables" {
  type        = list(string)
  default     = []
  description = "Tables to generate ABAC policies for. Used by generate_abac.py only; ignored by Terraform."
}

# ----------------------------------------------------------------------------
# SQL warehouse (shared by masking function deployment + Genie Space)
# ----------------------------------------------------------------------------

variable "sql_warehouse_id" {
  type        = string
  # Empty string (not null) is the "auto-create" sentinel checked elsewhere.
  default     = ""
  description = "Existing SQL warehouse ID. When set, reused for masking function deployment and Genie Space. When empty, Terraform auto-creates a serverless warehouse."
}

# ----------------------------------------------------------------------------
# Groups
# ----------------------------------------------------------------------------

variable "groups" {
  # No default: callers must supply at least the map (it may be empty).
  type = map(object({
    description = optional(string, "")
  }))
  description = "Map of group name -> config. Each key becomes an account-level databricks_group, assigned to the workspace with consumer entitlements."
}

# ----------------------------------------------------------------------------
# Group members (optional)
# ----------------------------------------------------------------------------

variable "group_members" {
  type        = map(list(string))
  default     = {}
  description = "Map of group name -> list of account-level user IDs. Adds users to the corresponding group. Get IDs from Account Console > Users or SCIM API."
}

# ----------------------------------------------------------------------------
# Tag policies
# ----------------------------------------------------------------------------

variable "tag_policies" {
  type = list(object({
    key         = string
    description = optional(string, "")
    values      = list(string)
  }))
  default     = []
  description = "Tag policies to create. Each becomes a databricks_tag_policy with governed allowed values."
}

# ----------------------------------------------------------------------------
# Tag assignments
# ----------------------------------------------------------------------------

variable "tag_assignments" {
  type = list(object({
    entity_type = string
    entity_name = string
    tag_key     = string
    tag_value   = string
  }))
  default     = []
  description = "Tag-to-entity mappings. entity_type is 'tables' or 'columns'. entity_name must be fully qualified (catalog.schema.table for tables, catalog.schema.table.column for columns)."
}

# ----------------------------------------------------------------------------
# FGAC policies
# ----------------------------------------------------------------------------

variable "fgac_policies" {
  type = list(object({
    name              = string
    policy_type       = string
    catalog           = string
    to_principals     = list(string)
    except_principals = optional(list(string), [])
    comment           = optional(string, "")
    # match_condition/match_alias apply to column masks; when_condition to
    # row filters — both optional at the schema level, validated downstream.
    match_condition   = optional(string)
    match_alias       = optional(string)
    function_name     = string
    function_catalog  = string
    function_schema   = string
    when_condition    = optional(string)
  }))
  default     = []
  description = "FGAC policies. catalog: which catalog the policy is scoped to. function_catalog/function_schema: where the masking UDF lives. function_name: relative UDF name (e.g. 'mask_pii_partial')."
}

# ----------------------------------------------------------------------------
# Genie Space
# ----------------------------------------------------------------------------

variable "warehouse_name" {
  type        = string
  default     = "ABAC Serverless Warehouse"
  description = "Name of the auto-created serverless warehouse (only used when sql_warehouse_id is empty)."
}

variable "genie_space_id" {
  type        = string
  default     = ""
  description = "Existing Genie Space ID. When set, Terraform applies CAN_RUN ACLs for configured groups. When empty and uc_tables is non-empty, Terraform auto-creates a new Genie Space."
}

variable "genie_space_title" {
  type        = string
  default     = "One Ready Genie Space"
  description = "Title for the auto-created Genie Space (only used when genie_space_id is empty)."
}

variable "genie_space_description" {
  type        = string
  default     = ""
  description = "Optional description for the auto-created Genie Space (only used when genie_space_id is empty)."
}

variable "genie_sample_questions" {
  type        = list(string)
  default     = []
  description = "Sample questions shown to users in the Genie Space UI. Auto-generated by generate_abac.py if not set."
}

variable "genie_instructions" {
  type        = string
  default     = ""
  description = "Text instructions for the Genie Space LLM (e.g., domain-specific guidance, calculation rules)."
}

variable "genie_benchmarks" {
  type = list(object({
    question = string
    sql      = string
  }))
  default     = []
  description = "Benchmark questions with ground-truth SQL for evaluating Genie Space accuracy."
}

variable "genie_sql_filters" {
  type = list(object({
    sql          = string
    display_name = string
    comment      = string
    instruction  = string
  }))
  default     = []
  description = "SQL snippet filters for the Genie Space (e.g., default WHERE clauses like active customers, completed transactions)."
}

variable "genie_sql_expressions" {
  type = list(object({
    alias        = string
    sql          = string
    display_name = string
    comment      = string
    instruction  = string
  }))
  default     = []
  description = "SQL snippet expressions/dimensions for the Genie Space (e.g., transaction year, age bucket)."
}

variable "genie_sql_measures" {
  type = list(object({
    alias        = string
    sql          = string
    display_name = string
    comment      = string
    instruction  = string
  }))
  default     = []
  description = "SQL snippet measures/aggregations for the Genie Space (e.g., total revenue, average risk score)."
}

variable "genie_join_specs" {
  type = list(object({
    left_table  = string
    left_alias  = string
    right_table = string
    right_alias = string
    sql         = string
    comment     = string
    instruction = string
  }))
  default     = []
  description = "Join specifications between tables for the Genie Space (e.g., accounts to customers on CustomerID)."
}
diff --git a/uc-quickstart/utils/genie/aws/warehouse.tf b/uc-quickstart/utils/genie/aws/warehouse.tf
new file mode 100644
index 00000000..0204cd85
--- /dev/null
+++ b/uc-quickstart/utils/genie/aws/warehouse.tf
@@ -0,0 +1,29 @@
# ============================================================================
# SQL Warehouse (shared by masking function deployment + Genie Space)
# ============================================================================
# When sql_warehouse_id is set in env.auto.tfvars, that existing warehouse is
# reused for everything. When empty, Terraform auto-creates a serverless
# warehouse. The effective ID is exposed as local.effective_warehouse_id.
# ============================================================================

locals {
  # Resolve to the user-supplied warehouse when given, otherwise to the
  # single auto-created endpoint below (index [0] because of count).
  effective_warehouse_id = (
    var.sql_warehouse_id != ""
    ? var.sql_warehouse_id
    : databricks_sql_endpoint.warehouse[0].id
  )
}

resource "databricks_sql_endpoint" "warehouse" {
  # count acts as an on/off switch: skip creation when an ID was supplied.
  count = var.sql_warehouse_id != "" ? 0 : 1

  provider         = databricks.workspace
  name             = var.warehouse_name
  cluster_size     = "Small"
  max_num_clusters = 1

  # Serverless PRO endpoint; stops itself after 15 idle minutes.
  enable_serverless_compute = true
  warehouse_type            = "PRO"

  auto_stop_mins = 15
}