created the first version of the repo

JoshuaHarris391 · JoshuaHarris391 · commit d021685f0996 · 2025-03-14T17:15:57.000+11:00
diff --git a/README.md b/README.md
@@ -1,2 +1,61 @@
 # gen3-metadata
 User friendly tools for downloading and manipulating gen3 metadata
+
+
+## 1. Set up python venv
+```bash
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+## 2. Create config file 
+```bash
+echo credentials_path=\"/path/to/credentials.json\" > .env
+```
+
+## 3. Load library
+```bash
+pip install -e .
+```
+
+
+## 4. Usage Example
+
+```python
+import os
+from gen3_metadata.parser import Gen3MetadataParser
+
+# Set up API and credentials
+api = "https://data.test.biocommons.org.au"
+key_file = os.getenv('credentials_path')
+
+# Initialize the Gen3MetadataParser
+gen3metadata = Gen3MetadataParser(api, key_file)
+
+# Fetch data for different categories
+gen3metadata.fetch_data("program1", "AusDiab_Simulated", "subject")
+gen3metadata.fetch_data("program1", "AusDiab_Simulated", "demographic")
+gen3metadata.fetch_data("program1", "AusDiab_Simulated", "medical_history")
+
+# Convert fetched data to a pandas DataFrame
+gen3metadata.data_to_pd()
+
+# Print the keys of the data sets that have been fetched
+print(gen3metadata.data_store.keys())
+
+# Return a json of one of the datasets
+gen3metadata.data_store["program1/AusDiab_Simulated/subject"]
+
+# Return the pandas dataframe of one of the datasets
+gen3metadata.data_store_pd["program1/AusDiab_Simulated/subject"]
+```
+
+The fetched data is stored in the `data_store` dictionary within the `Gen3MetadataParser` instance.
+Each node fetched is stored as a key-value pair in this dictionary, where the key is
+`"<program_name>/<project_code>/<node_label>"` and the value is the corresponding data.
+This allows for easy access and manipulation of the data after it has been fetched.
+
+
+
+
diff --git a/protyping.ipynb b/protyping.ipynb
@@ -0,0 +1,79 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# testing and prototyping"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gen3_metadata\n",
+    "from gen3_metadata.parser import Gen3MetadataParser"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "status code: 200\n",
+      "Data for program1/AusDiab_Simulated/subject has been fetched and stored.\n",
+      "status code: 200\n",
+      "Data for program1/AusDiab_Simulated/demographic has been fetched and stored.\n",
+      "status code: 200\n",
+      "Data for program1/AusDiab_Simulated/medical_history has been fetched and stored.\n",
+      "Converting program1/AusDiab_Simulated/subject to pandas dataframe...\n",
+      "Converting program1/AusDiab_Simulated/demographic to pandas dataframe...\n",
+      "Converting program1/AusDiab_Simulated/medical_history to pandas dataframe...\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "from gen3_metadata.parser import Gen3MetadataParser\n",
+    "\n",
+    "api = \"https://data.test.biocommons.org.au\"\n",
+    "key_file = os.getenv('credentials_path')\n",
+    "gen3metadata = Gen3MetadataParser(api, key_file)\n",
+    "\n",
+    "\n",
+    "gen3metadata.fetch_data(\"program1\", \"AusDiab_Simulated\", \"subject\")\n",
+    "gen3metadata.fetch_data(\"program1\", \"AusDiab_Simulated\", \"demographic\")\n",
+    "gen3metadata.fetch_data(\"program1\", \"AusDiab_Simulated\", \"medical_history\")\n",
+    "\n",
+    "gen3metadata.data_to_pd()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+requests
+pandas
+setuptools
diff --git a/setup.py b/setup.py
@@ -0,0 +1,31 @@
+"""Packaging configuration for the gen3_metadata library."""
+from pathlib import Path
+
+from setuptools import setup, find_packages
+
+
+def _read_requirements():
+    """Return the requirement specifiers listed in requirements.txt.
+
+    Blank lines and ``#`` comment lines are skipped.
+    """
+    # Resolve the path relative to this file rather than the current working
+    # directory, so the build does not depend on where it is invoked from.
+    requirements_path = Path(__file__).resolve().parent / "requirements.txt"
+    # read_text opens and closes the file, so no handle is left dangling.
+    lines = requirements_path.read_text(encoding="utf-8").splitlines()
+    stripped = (line.strip() for line in lines)
+    return [line for line in stripped if line and not line.startswith("#")]
+
+
+setup(
+    name="gen3_metadata",  # Name of your package
+    version="0.1",  # Version number
+    packages=find_packages(where="src"),  # Automatically find packages in src/
+    package_dir={"": "src"},  # Tell setuptools that packages are in src/
+    install_requires=_read_requirements(),  # Add dependencies from requirements.txt
+    description="A library for downloading and manipulating gen3 metadata",
+    author="Joshua Harris",
+    author_email="harjo391@gmail.com",
+    url="https://github.com/AustralianBioCommons/gen3-metadata",  # Optional: GitHub or project URL
+)
diff --git a/src/gen3_metadata/__init__.py b/src/gen3_metadata/__init__.py
@@ -0,0 +1 @@
+from .parser import *
diff --git a/src/gen3_metadata/parser.py b/src/gen3_metadata/parser.py
@@ -0,0 +1,125 @@
+import json
+import requests
+import os
+
+class Gen3MetadataParser:
+    """Fetch node metadata from a Gen3 API and keep it in memory.
+
+    Raw JSON payloads are kept in ``data_store`` and their pandas DataFrame
+    conversions in ``data_store_pd``. Both dictionaries use keys of the form
+    ``"<program_name>/<project_code>/<node_label>"``.
+    """
+
+    def __init__(self, api_url, key_file_path):
+        """Store the connection settings and request an access token.
+
+        Args:
+            api_url: Base URL of the Gen3 instance; endpoint paths are
+                appended to it after a ``/``.
+            key_file_path: Path to the JSON credentials file.
+
+        Raises:
+            TypeError: If ``key_file_path`` is None.
+            OSError: If the credentials file cannot be opened.
+            requests.exceptions.HTTPError: If the token request returns an
+                error status.
+        """
+        self.api_url = api_url
+        self.key_file_path = key_file_path
+        self.headers = self._authenticate()
+        self.data_store = {}  # Raw JSON payloads stored by fetch_data
+        self.data_store_pd = {}  # DataFrames built by data_to_pd
+
+    def _load_api_key(self):
+        """Read the credentials file and return its parsed JSON content.
+
+        Raises:
+            TypeError: If ``key_file_path`` is None.
+        """
+        if self.key_file_path is None:
+            # open(None) raises TypeError as well, but its message does not
+            # say which setting is missing.
+            raise TypeError(
+                "key_file_path is None; pass the path to the credentials JSON file"
+            )
+        with open(self.key_file_path, encoding="utf-8") as json_file:
+            return json.load(json_file)
+
+    def _authenticate(self):
+        """Exchange the API key for an access token.
+
+        Returns:
+            dict: An ``Authorization`` header carrying the bearer token.
+
+        Raises:
+            requests.exceptions.HTTPError: If the token request returns an
+                error status.
+        """
+        key = self._load_api_key()
+        response = requests.post(f"{self.api_url}/user/credentials/cdis/access_token", json=key)
+        response.raise_for_status()  # Ensure any HTTP errors are raised
+        access_token = response.json()['access_token']
+        return {'Authorization': f"bearer {access_token}"}
+
+    def json_to_pdf(self, json_data):
+        """Flatten JSON data into a pandas DataFrame.
+
+        Despite the ``pdf`` suffix, the result is a DataFrame (the output of
+        ``pandas.json_normalize``), not a PDF document.
+        """
+        import pandas as pd
+        return pd.json_normalize(json_data)
+
+    def fetch_data(self, program_name, project_code, node_label, return_data=False):
+        """Export one node of a project and store the result in ``data_store``.
+
+        The payload is stored under the key
+        ``"<program_name>/<project_code>/<node_label>"``. Request and parsing
+        errors are printed rather than raised.
+
+        Args:
+            program_name: Program segment of the submission URL.
+            project_code: Project segment of the submission URL.
+            node_label: Node to export, sent as the ``node_label`` query
+                parameter.
+            return_data: If True, return the payload instead of printing a
+                confirmation message.
+
+        Returns:
+            The decoded JSON payload when ``return_data`` is True and the
+            request succeeded, otherwise None.
+        """
+        url = f"{self.api_url}/api/v0/submission/{program_name}/{project_code}/export/"
+        # Let requests build the query string so the values are URL-encoded.
+        query = {"node_label": node_label, "format": "json"}
+        try:
+            response = requests.get(url, headers=self.headers, params=query)
+            print(f"status code: {response.status_code}")
+            response.raise_for_status()  # Ensure any HTTP errors are raised
+            data = response.json()
+
+            # Create a key from program_name, project_code, and node_label
+            key = f"{program_name}/{project_code}/{node_label}"
+
+            # Store the data in the dictionary with the created key
+            self.data_store[key] = data
+
+            if return_data:
+                return data
+            print(f"Data for {key} has been fetched and stored.")
+        except requests.exceptions.HTTPError as http_err:
+            print(f"HTTP error occurred: {http_err} - Status Code: {response.status_code}")
+        except Exception as err:
+            # Kept broad to preserve the existing best-effort behaviour:
+            # failures are reported on stdout instead of being raised.
+            print(f"An error occurred: {err}")
+
+    def data_to_pd(self):
+        """Convert every payload in ``data_store`` to a pandas DataFrame.
+
+        Each DataFrame is built from the payload's ``'data'`` entry and stored
+        in ``data_store_pd`` under the same key.
+        """
+        for key, value in self.data_store.items():
+            print(f"Converting {key} to pandas dataframe...")
+            self.data_store_pd[key] = self.json_to_pdf(value['data'])