Added data_gen test

Amitayush Thakur · Amitayush Thakur · commit abbeafa4c5b5 · 2025-02-10T01:52:09.000-06:00
diff --git a/.github/workflows/github-build-actions.yaml b/.github/workflows/github-build-actions.yaml
@@ -75,9 +75,16 @@ jobs:
       - name: List repository files (debug step)
         run: find . -type f
 
-      - name: Run tests
+      - name: Run Simple Env Test
         shell: bash
         run: |
           eval $(opam env)
           source $HOME/.elan/env
-          python src/test/simple_env_test.py
+          python src/test/simple_env_test.py
+
+      - name: Run Data Gen Test
+        shell: bash
+        run: |
+          eval $(opam env)
+          source $HOME/.elan/env
+          python src/test/simple_data_gen_test.py
diff --git a/README.md b/README.md
@@ -193,6 +193,8 @@ action = ProofAction(
 
 ## Generating Proof Step Data:
 
+>NOTE: Make sure that you have installed the `itp-interface` package before running the following commands.
+
 1.a. You need to run the following command to generate sample proof step data for Lean 4:
 ```
 run-itp-data-gen --config-dir src/itp_interface/main/configs  --config-name simple_lean_data_gen
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
 build-backend = "hatchling.build"
 [project]
 name = "itp_interface"
-version = "1.1.1"
+version = "1.1.2"
 authors = [
   { name="Amitayush Thakur", email="amitayush@utexas.edu" },
 ]
diff --git a/src/itp_interface/main/configs/benchmark/simple_benchmark_lean.yaml b/src/itp_interface/main/configs/benchmark/simple_benchmark_lean.yaml
@@ -6,7 +6,7 @@ few_shot_metadata_filename_for_retrieval:
 dfs_data_path_for_retrieval: 
 dfs_metadata_filename_for_retrieval: 
 datasets:
-  - project: data/test/lean4_proj
+  - project: src/data/test/lean4_proj
     files:
       - path: Lean4Proj/Basic.lean
         theorems: "*"
diff --git a/src/itp_interface/main/run_tool.py b/src/itp_interface/main/run_tool.py
@@ -420,17 +420,19 @@ def run_data_generation(experiment: Experiments, log_dir: str, logger: logging.L
             time.sleep(10)
     logger.info(f"Finished running experiment: \n{experiment.to_json(indent=4)}")
 
-@hydra.main(config_path="configs", config_name="experiments", version_base="1.2")
+@hydra.main(config_path="configs", config_name="simple_lean_data_gen", version_base="1.2")
 def main(cfg):
     os.environ["PYTHONPATH"] = f"{root_dir}:{os.environ.get('PYTHONPATH', '')}"
     # RayUtils.init_ray(num_of_cpus=cfg.run_settings.pool_size, object_store_memory_in_gb=100)
     experiment = parse_config(cfg)
-    os.chdir(root_dir)
+    # os.chdir(root_dir)
     # top_level_dir = os.path.dirname(root_dir)
     # top_level_dir = os.path.dirname(top_level_dir)
     # os.chdir(top_level_dir)
     log_dir = ".log/data_generation/benchmark/{}/{}".format(experiment.benchmark.name, time.strftime("%Y%m%d-%H%M%S"))
     os.makedirs(log_dir, exist_ok=True)
+    abs_path = os.path.abspath(log_dir)
+    print(f"Log Dir: {abs_path}")
     log_path = os.path.join(log_dir, "eval.log")
     logger = setup_logger(__name__, log_path, logging.INFO, '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
     logger.info(f"Pid: {os.getpid()}")
diff --git a/src/test/simple_data_gen_test.py b/src/test/simple_data_gen_test.py
@@ -0,0 +1,52 @@
+import unittest
+import os
+import subprocess
+
+class TestDataGen(unittest.TestCase):
+    def test_proof_step_data_gen(self):
+        """
+        Test that the 'run-itp-data-gen' command runs successfully with the given configuration.
+        """
+        # Construct the command as a single string.
+        command = (
+            "run-itp-data-gen --config-dir=src/itp_interface/main/configs "
+            "--config-name=simple_lean_data_gen.yaml"
+        )
+
+        try:
+            # Run the command using shell=True so that the shell does the PATH lookup.
+            result = subprocess.run(
+                command,
+                shell=True,
+                capture_output=True,
+                text=True,
+                timeout=700
+            )
+        except subprocess.TimeoutExpired as e:
+            self.fail(f"'run-itp-data-gen' command timed out: {e}")
+        except Exception as e:
+            self.fail(f"Error running 'run-itp-data-gen': {e}")
+
+        # Check that the command exited with a return code of 0.
+        self.assertEqual(
+            result.returncode, 0,
+            msg=f"'run-itp-data-gen' failed with return code {result.returncode}. Stderr: {result.stderr}"
+        )
+
+        # List the timestamp-named run directories under
+        # .log/data_generation/benchmark/simple_benchmark_lean and pick the
+        # last one in sorted order, i.e. the run that was just generated.
+        dirs = sorted(os.listdir(".log/data_generation/benchmark/simple_benchmark_lean"))
+        print(dirs)
+        last_dir = dirs[-1]
+        train_data = os.path.join(".log/data_generation/benchmark/simple_benchmark_lean", last_dir, "train")
+        data_gen_file = os.path.join(train_data, "local_data_0000000016.json")
+        print("Data Gen File:", data_gen_file)
+        with open(data_gen_file, "r") as f:
+            print(f.read())
+
+def main():
+    unittest.main()
+
+if __name__ == '__main__':
+    main()

Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@ requires = [`
`5`	`5`	`build-backend = "hatchling.build"`
`6`	`6`	`[project]`
`7`	`7`	`name = "itp_interface"`
`8`		`-version = "1.1.1"`
	`8`	`+version = "1.1.2"`
`9`	`9`	`authors = [`
`10`	`10`	`{ name="Amitayush Thakur", email="amitayush@utexas.edu" },`
`11`	`11`	`]`