# -*- coding: utf-8 -*-
import tomli
from pathlib import Path
import re
import logging
from somef.process_results import Result
from somef.utils import constants


def _split_author(entry):
    """Split a ``"Name <email>"`` string into a ``(name, email)`` pair.

    ``email`` is ``None`` when the entry has no trailing ``<...>`` section;
    the name is then the whole entry with surrounding whitespace removed.
    """
    text = entry.strip()
    match = re.match(r'^(.+?)\s*<(.+?)>$', text)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    return text, None


def parse_project_toml(file_path, metadata_result: Result, source):
    """
    Parse a Project.toml file to extract metadata.

    Results are added in this order: the package-file marker, ``name``,
    every ``[compat]`` entry (stored under the runtime-platform category
    together with its version constraint), ``version``, ``uuid``, every key
    of ``[deps]`` and every entry of ``authors``.

    A file whose base name is not exactly ``Project.toml`` is left
    untouched. Sections with an unexpected shape (``compat``/``deps`` that
    are not tables, author entries that are not non-empty strings) are
    skipped instead of aborting the extraction of the remaining fields.
    A scalar ``authors`` string is treated as a single author.

    Parameters
    ----------
    file_path: path to the Project.toml file being analyzed
    metadata_result: Metadata object dictionary
    source: source of the package file (URL)

    Returns
    -------
    The ``metadata_result`` object that was passed in. Any exception raised
    while reading or parsing is logged and swallowed, so results added
    before the failure are kept.
    """
    def record(category, payload):
        # The literal 1 is passed positionally exactly as before
        # (presumably a confidence score - confirm against Result.add_result).
        metadata_result.add_result(
            category,
            payload,
            1,
            constants.TECHNIQUE_CODE_CONFIG_PARSER,
            source
        )

    try:
        if Path(file_path).name != "Project.toml":
            return metadata_result

        record(
            constants.CAT_HAS_PACKAGE_FILE,
            {"value": "Project.toml", "type": constants.URL},
        )

        # tomli only accepts binary file objects.
        with open(file_path, "rb") as f:
            data = tomli.load(f)

        if "name" in data:
            record(
                constants.CAT_PACKAGE_ID,
                {"value": data["name"], "type": constants.STRING},
            )

        compat = data.get("compat")
        if isinstance(compat, dict):
            for package_name, version in compat.items():
                record(
                    constants.CAT_RUNTIME_PLATFORM,
                    {
                        "value": package_name,
                        "package": package_name,
                        "version": version,
                        "type": constants.STRING,
                    },
                )

        if "version" in data:
            record(
                constants.CAT_VERSION,
                {"value": data["version"], "type": constants.STRING},
            )

        if "uuid" in data:
            record(
                constants.CAT_IDENTIFIER,
                {"value": data["uuid"], "type": constants.STRING},
            )

        deps = data.get("deps")
        if isinstance(deps, dict):
            # Only the dependency names are recorded; the UUID values are ignored.
            for req in deps:
                record(
                    constants.CAT_REQUIREMENTS,
                    {"value": req, "type": constants.STRING},
                )

        authors = data.get("authors", [])
        if isinstance(authors, str):
            # Iterating a bare string would yield one "author" per character.
            authors = [authors]
        elif not isinstance(authors, list):
            authors = []

        for auth in authors:
            if not isinstance(auth, str) or not auth.strip():
                continue

            author_name, author_email = _split_author(auth)
            author = {"value": author_name, "name": author_name}
            if author_email is not None:
                author["email"] = author_email
            author["type"] = constants.AGENT
            record(constants.CAT_AUTHORS, author)

    except Exception as e:
        # Best effort: a broken Project.toml must not stop repository processing.
        logging.error("Error parsing Project.toml file %s: %s", file_path, e)

    return metadata_result
import extract_ontologies, extract_workflows from .process_results import Result from .regular_expressions import detect_license_spdx, extract_scholarly_article_natural, extract_scholarly_article_properties @@ -21,6 +21,7 @@ from .parser.bower_parser import parse_bower_json_file from .parser.gemspec_parser import parse_gemspec_file from .parser.description_parser import parse_description_file +from somef.test.julia_parser import parse_project_toml from chardet import detect @@ -235,8 +236,8 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner if filename.lower() == "pom.xml" or filename.lower() == "package.json" or \ filename.lower() == "pyproject.toml" or filename.lower() == "setup.py" or filename.endswith(".gemspec") or \ filename.lower() == "requirements.txt" or filename.lower() == "bower.json" or filename == "DESCRIPTION" or \ - (filename.lower() == "cargo.toml" and repo_relative_path == ".") or (filename.lower() == "composer.json" and repo_relative_path == "."): - + (filename.lower() == "cargo.toml" and repo_relative_path == ".") or (filename.lower() == "composer.json" and repo_relative_path == ".") or \ + (filename == "Project.toml" and repo_relative_path == "."): if filename.lower() in parsed_build_files and repo_relative_path != ".": logging.info(f"Ignoring secondary {filename} in {dir_path}") continue @@ -253,7 +254,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner }, 1, constants.TECHNIQUE_FILE_EXPLORATION, build_file_url) - logging.info(f"############### Processing package file: {filename} ############### ") + logging.info(f"############### (NEW UPDATE) Processing package file: {filename} ############### ") if filename.lower() == "pom.xml": metadata_result = parse_pom_file(os.path.join(dir_path, filename), metadata_result, build_file_url) if filename.lower() == "package.json": @@ -274,6 +275,8 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner 
metadata_result = parse_gemspec_file(os.path.join(dir_path, filename), metadata_result, build_file_url) if filename == "DESCRIPTION": metadata_result = parse_description_file(os.path.join(dir_path, filename), metadata_result, build_file_url) + if filename == "Project.toml": + metadata_result = parse_project_toml(os.path.join(dir_path, filename), metadata_result, build_file_url) parsed_build_files.add(filename.lower()) diff --git a/src/somef/test/test_data/repositories/Flux.jl/Project.toml b/src/somef/test/test_data/repositories/Flux.jl/Project.toml new file mode 100644 index 00000000..2458862f --- /dev/null +++ b/src/somef/test/test_data/repositories/Flux.jl/Project.toml @@ -0,0 +1,71 @@ +name = "Flux" +uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" +version = "0.16.5" + +[deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MLCore = "c2834f40-e789-41da-a90e-33b280584a8c" +MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" +MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" +MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" +Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" +Preferences = "21216c6a-2e73-6563-6e65-726566657250" +ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" + +[weakdeps] +AMDGPU = 
"21141c5a-9bdb-4563-92ae-f87d6854732e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b" +cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" + +[extensions] +FluxAMDGPUExt = "AMDGPU" +FluxCUDAExt = "CUDA" +FluxCUDAcuDNNExt = ["CUDA", "cuDNN"] +FluxEnzymeExt = "Enzyme" +FluxMPIExt = "MPI" +FluxMPINCCLExt = ["CUDA", "MPI", "NCCL"] + +[compat] +AMDGPU = "1, 2" +Adapt = "4" +CUDA = "5" +ChainRulesCore = "1.12" +Compat = "4.10.0" +Enzyme = "0.13" +EnzymeCore = "0.7.7, 0.8.4" +Functors = "0.5" +MLCore = "1.0.0" +MLDataDevices = "1.4.2" +MLUtils = "0.4" +MPI = "0.20.19" +MacroTools = "0.5" +NCCL = "0.1.1" +NNlib = "0.9.22" +OneHotArrays = "0.2.4" +Optimisers = "0.4.1" +Preferences = "1" +ProgressLogging = "0.1" +Reexport = "1.0" +Setfield = "1.1" +SpecialFunctions = "2.1.2" +Statistics = "1" +Zygote = "0.6.67, 0.7" +cuDNN = "1" +julia = "1.10" diff --git a/src/somef/test/test_data/repositories/Pluto.jl/Project.toml b/src/somef/test/test_data/repositories/Pluto.jl/Project.toml new file mode 100644 index 00000000..74725411 --- /dev/null +++ b/src/somef/test/test_data/repositories/Pluto.jl/Project.toml @@ -0,0 +1,84 @@ +name = "Pluto" +uuid = "c3e4b0f8-55cb-11ea-2926-15256bba5781" +license = "MIT" +authors = ["Fons van der Plas <fons@plutojl.org>"] +version = "0.20.20" + +[deps] +Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" +Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +ExpressionExplorer = "21656369-7473-754a-2065-74616d696c43" +FileWatching = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" +GracefulPkg = "828d9ff0-206c-6161-646e-6576656f7244" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +HypertextLiteral = "ac1192a8-f4b3-4bfe-ba22-af5b92cd3ab2" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637" 
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" +MIMEs = "6c6e2e6c-3030-632d-7369-2d6c69616d65" +Malt = "36869731-bdee-424d-aa32-cab38c994e3b" +Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" +MsgPack = "99f44e22-a591-53d1-9472-aa23ef4bd671" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +PlutoDependencyExplorer = "72656b73-756c-7461-726b-72656b6b696b" +PrecompileSignatures = "91cefc8d-f054-46dc-8f8c-26e11d7c5411" +PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +RegistryInstances = "2792f1a3-b283-48e8-9a74-f99dce5104f3" +RelocatableFolders = "05181044-ff0b-4ac5-8273-598c1e38db00" +SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" +Scratch = "6c6a2e73-6563-6170-7368-637461726353" +Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" +TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[compat] +Base64 = "1" +Configurations = "0.15, 0.16, 0.17" +Dates = "0, 1" +Downloads = "1" +ExpressionExplorer = "0.5, 0.6, 1" +FileWatching = "1" +GracefulPkg = "2" +HTTP = "^1.10.17" +HypertextLiteral = "0.7, 0.8, 0.9" +InteractiveUtils = "1" +LRUCache = "1.6.2" +Logging = "1" +LoggingExtras = "0.4, 1" +MIMEs = "0.1, 1" +Malt = "1.1" +Markdown = "1" +MsgPack = "1.1" +Pkg = "1" +PlutoDependencyExplorer = "~1.2" +PrecompileSignatures = "3" +PrecompileTools = "~1.2, ~1.3" +REPL = "1" +RegistryInstances = "0.1" +RelocatableFolders = "0.1, 0.2, 0.3, 1" +SHA = "0.7, 1" +Scratch = "1.1" +Sockets = "1" +TOML = "1" +Tables = "1" +URIs = "1.3" +UUIDs = "1" +julia = "^1.10" + +[extras] +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" +Test = 
# -*- coding: utf-8 -*-
import unittest
import os
from pathlib import Path

from somef.process_results import Result
from somef.parser.julia_parser import parse_project_toml
from somef.utils import constants

test_data_path = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep


class TestJuliaParser(unittest.TestCase):
    """Runs parse_project_toml against the Pluto.jl and Flux.jl fixtures."""

    def _parse_fixture(self, repo_folder, url):
        """Parse ``<test data>/<repo_folder>/Project.toml`` into a fresh Result."""
        fixture = os.path.join(test_data_path, repo_folder, "Project.toml")
        return parse_project_toml(fixture, Result(), url)

    def _non_empty(self, parsed, category, message):
        """Return the entries stored under ``category``, failing with ``message`` if there are none."""
        entries = parsed.results.get(category, [])
        self.assertTrue(len(entries) > 0, message)
        return entries

    def _check_string_value(self, parsed, category, message, expected):
        """Assert the first entry of ``category`` is a STRING result equal to ``expected``."""
        first = self._non_empty(parsed, category, message)[0]["result"]
        self.assertEqual(first["value"], expected)
        self.assertEqual(first["type"], constants.STRING)

    def test_parse_pluto_project_toml(self):
        """Test parsing Pluto's Project.toml file"""
        parsed = self._parse_fixture("Pluto.jl", "http://example.com/repo1/Project.toml")

        package_file = self._non_empty(
            parsed, constants.CAT_HAS_PACKAGE_FILE, "No package file info found")[0]
        self.assertEqual(package_file["result"]["value"], "Project.toml")
        self.assertEqual(package_file["result"]["type"], constants.URL)
        self.assertEqual(package_file["technique"], constants.TECHNIQUE_CODE_CONFIG_PARSER)

        self._check_string_value(
            parsed, constants.CAT_PACKAGE_ID, "No package ID found", "Pluto")
        self._check_string_value(
            parsed, constants.CAT_VERSION, "No version found", "0.20.20")
        self._check_string_value(
            parsed, constants.CAT_IDENTIFIER, "No identifier found",
            "c3e4b0f8-55cb-11ea-2926-15256bba5781")

        first_author = self._non_empty(
            parsed, constants.CAT_AUTHORS, "No author found")[0]["result"]
        self.assertEqual(first_author["name"], "Fons van der Plas")
        self.assertEqual(first_author["email"], "fons@plutojl.org")
        self.assertEqual(first_author["type"], constants.AGENT)

        requirements = self._non_empty(
            parsed, constants.CAT_REQUIREMENTS, "No dependencies found")
        dependency_names = [entry["result"]["value"] for entry in requirements]
        for expected_dep in ("HTTP", "Markdown", "Pkg", "REPL"):
            self.assertIn(expected_dep, dependency_names)
        for entry in requirements:
            self.assertEqual(entry["result"]["type"], constants.STRING)
            self.assertEqual(entry["technique"], constants.TECHNIQUE_CODE_CONFIG_PARSER)

        runtimes = self._non_empty(
            parsed, constants.CAT_RUNTIME_PLATFORM, "No runtime platform info found")
        # Spot-check a couple of compat constraints.
        constraints = dict(
            (entry["result"]["package"], entry["result"]["version"]) for entry in runtimes
        )
        for package, constraint in (("HTTP", "^1.10.17"), ("julia", "^1.10")):
            self.assertIn(package, constraints)
            self.assertEqual(constraints[package], constraint)
        for entry in runtimes:
            self.assertEqual(entry["result"]["type"], constants.STRING)
            self.assertEqual(entry["technique"], constants.TECHNIQUE_CODE_CONFIG_PARSER)

    def test_parse_flux_project_toml(self):
        """Test parsing Flux's Project.toml file"""
        parsed = self._parse_fixture("Flux.jl", "http://example.com/repo2/Project.toml")

        package_file = self._non_empty(
            parsed, constants.CAT_HAS_PACKAGE_FILE, "No package file info found")[0]
        self.assertEqual(package_file["result"]["value"], "Project.toml")

        self._check_string_value(
            parsed, constants.CAT_PACKAGE_ID, "No package ID found", "Flux")
        self._check_string_value(
            parsed, constants.CAT_VERSION, "No version found", "0.16.5")
        self._check_string_value(
            parsed, constants.CAT_IDENTIFIER, "No identifier found",
            "587475ba-b771-5e3f-ad9e-33799f191a9c")

        authors = parsed.results.get(constants.CAT_AUTHORS, [])
        self.assertEqual(len(authors), 0, "No authors should be found in Flux's Project.toml")

        requirements = self._non_empty(
            parsed, constants.CAT_REQUIREMENTS, "No dependencies found")
        dependency_names = [entry["result"]["value"] for entry in requirements]
        for expected_dep in ("Zygote", "NNlib", "Optimisers", "LinearAlgebra"):
            self.assertIn(expected_dep, dependency_names)

        runtimes = self._non_empty(
            parsed, constants.CAT_RUNTIME_PLATFORM, "No runtime platform info found")
        # Spot-check a few compat constraints.
        constraints = dict(
            (entry["result"]["package"], entry["result"]["version"]) for entry in runtimes
        )
        expected_constraints = (
            ("julia", "1.10"),
            ("NNlib", "0.9.22"),
            ("Zygote", "0.6.67, 0.7"),
        )
        for package, constraint in expected_constraints:
            self.assertIn(package, constraints)
            self.assertEqual(constraints[package], constraint)


if __name__ == "__main__":
    unittest.main()