Skip to content

Commit 112f4f2

Browse files
authored
Merge pull request #548 from aboutcode-org/matchcode-update
Add fragment matching to MatchCode
2 parents d1b5c11 + e3b6dba commit 112f4f2

19 files changed

+1330
-193
lines changed

Makefile

+26-16
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,18 @@ ACTIVATE?=. ${VENV}/bin/activate;
1616
VIRTUALENV_PYZ=../etc/thirdparty/virtualenv.pyz
1717
# Do not depend on Python to generate the SECRET_KEY
1818
GET_SECRET_KEY=`base64 /dev/urandom | head -c50`
19+
1920
# Customize with `$ make envfile ENV_FILE=/etc/purldb/.env`
2021
ENV_FILE=.env
22+
2123
# Customize with `$ make postgres PACKAGEDB_DB_PASSWORD=YOUR_PASSWORD`
2224
PACKAGEDB_DB_PASSWORD=packagedb
2325
MATCHCODEIO_DB_PASSWORD=matchcodeio
26+
SCANCODEIO_DB_PASSWORD=scancodeio
27+
28+
# Django settings shortcuts
29+
DJSM_PDB=DJANGO_SETTINGS_MODULE=purldb_project.settings
30+
DJSM_MAT=DJANGO_SETTINGS_MODULE=matchcode_project.settings
2431

2532
# Use sudo for postgres, but only on Linux
2633
UNAME := $(shell uname)
@@ -56,22 +63,17 @@ envfile_testing: envfile
5663
@echo SCANCODEIO_DB_USER=\"postgres\" >> ${ENV_FILE}
5764
@echo SCANCODEIO_DB_PASSWORD=\"postgres\" >> ${ENV_FILE}
5865

59-
doc8:
60-
@echo "-> Run doc8 validation"
61-
@${ACTIVATE} doc8 --max-line-length 100 --ignore-path docs/_build/ --quiet docs/
62-
6366
valid:
6467
@echo "-> Run Ruff format"
6568
@${ACTIVATE} ruff format --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/
6669
@echo "-> Run Ruff linter"
6770
@${ACTIVATE} ruff check --fix --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/
6871

69-
check:
72+
check: check_docs
7073
@echo "-> Run Ruff linter validation (pycodestyle, bandit, isort, and more)"
7174
@${ACTIVATE} ruff check --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/
7275
@echo "-> Run Ruff format validation"
7376
@${ACTIVATE} ruff format --check --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/
74-
@$(MAKE) doc8
7577

7678
clean:
7779
@echo "-> Clean the Python env"
@@ -102,7 +104,7 @@ postgres_matchcodeio:
102104
@echo "-> Create 'matchcodeio' database"
103105
${SUDO_POSTGRES} createdb --encoding=utf-8 --owner=matchcodeio matchcodeio
104106
${MATCHCODE_MANAGE} migrate
105-
107+
106108
run:
107109
${MANAGE} runserver 8001 --insecure
108110

@@ -118,13 +120,20 @@ run_visit: seed
118120
run_map:
119121
${MANAGE} run_map
120122

121-
test:
122-
@echo "-> Run the test suite"
123-
${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode_pipeline --ignore matchcode_project --ignore purldb-toolkit --ignore packagedb/tests/test_throttling.py
124-
${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs packagedb/tests/test_throttling.py
125-
${ACTIVATE} DJANGO_SETTINGS_MODULE=matchcode_project.settings ${PYTHON_EXE} -m pytest -vvs matchcode_pipeline
126-
${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs purldb-toolkit/
127-
${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs purl2vcs
123+
test_purldb:
124+
${ACTIVATE} ${DJSM_PDB} pytest -vvs --lf minecode packagedb purl2vcs purldb_project purldb_public_project --ignore packagedb/tests/test_throttling.py
125+
${ACTIVATE} ${DJSM_PDB} pytest -vvs --lf packagedb/tests/test_throttling.py
126+
127+
test_toolkit:
128+
${ACTIVATE} pytest -vvs purldb-toolkit/
129+
130+
test_clearcode:
131+
${ACTIVATE} ${DJSM_PDB} ${PYTHON_EXE} -m pytest -vvs clearcode clearindex
132+
133+
test_matchcode:
134+
${ACTIVATE} ${DJSM_MAT} ${PYTHON_EXE} -m pytest -vvs matchcode_pipeline matchcode-toolkit matchcode
135+
136+
test: test_purldb test_matchcode test_toolkit test_clearcode
128137

129138
shell:
130139
${MANAGE} shell
@@ -153,7 +162,7 @@ check_docs:
153162
@echo "Check Sphinx Documentation build minimally"
154163
@${ACTIVATE} sphinx-build -E -W docs/source build
155164
@echo "Check for documentation style errors"
156-
@${ACTIVATE} doc8 --max-line-length 100 docs/source --ignore D000 --quiet
165+
@${ACTIVATE} doc8 --max-line-length 100 docs/source --ignore-path docs/_build/ --ignore D000 --quiet
157166

158167
docker-images:
159168
@echo "-> Build Docker services"
@@ -164,4 +173,5 @@ docker-images:
164173
@mkdir -p dist/
165174
@docker save minecode minecode_minecode nginx | gzip > dist/minecode-images-`git describe --tags`.tar.gz
166175

167-
.PHONY: virtualenv conf dev envfile isort black doc8 valid check clean migrate postgres run test shell clearsync clearindex index_packages bump docs docker-images
176+
# keep this sorted
177+
.PHONY: black bump check check_docs clean clearindex clearsync conf dev docker-images docs envfile envfile_testing index_packages isort migrate postgres postgres_matchcodeio priority_queue run run_map run_matchcodeio run_visit seed shell test test_clearcode test_matchcode test_purldb test_toolkit valid virtualenv

README.rst

+12-13
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,23 @@
1+
==========
12
The purldb
23
==========
3-
This repo consists of these main tools:
44

5-
- PackageDB that is the reference model (based on ScanCode toolkit)
6-
that contains package data with purl (Package URLs) being a first
7-
class citizen.
5+
This purldb consists of these main tools:
6+
7+
- PackageDB that is the reference model (based on ScanCode toolkit) that contains package data with
8+
PURL (Package URLs) being a first class citizen.
89
- MineCode that contains utilities to mine package repositories
9-
- MatchCode that contains utilities to index package metadata and resources for
10-
matching
10+
- MatchCode that contains utilities to index package metadata and resources for matching
1111
- MatchCode.io that provides package matching functionalities for codebases
1212
- ClearCode that contains utilities to mine Clearlydefined for package data
13-
- purldb-toolkit CLI utility and library to use the PurlDB, its API and various
14-
related libraries.
13+
- purldb-toolkit CLI utility and library to use the PurlDB, its API and various related libraries.
1514

16-
These are designed to be used first for reference such that one can query for
17-
packages by purl and validate purl existence.
15+
These are designed to be used first for reference such that one can query for packages by purl and
16+
validate purl existence.
1817

19-
In the future, the collected packages will be used as reference for dependency
20-
resolution, as a reference knowledge base for all package data, as a reference
21-
for vulnerable range resolution and more.
18+
In the future, the collected packages will be used as reference for dependency resolution, as a
19+
reference knowledge base for all package data, as a reference for vulnerable range resolution and
20+
more.
2221

2322
Documentation
2423
-------------

azure-pipelines.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ jobs:
3939
image_name: ubuntu-22.04
4040
python_versions: ['3.10']
4141
test_suites:
42-
all: make check_docs
42+
all: make docs check_docs
4343

4444
- template: etc/ci/azure-posix.yml
4545
parameters:

docs/source/how-to-guides/deploy_to_devel.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ directly or through a command line purlcli tool sub-command.
1313

1414
.. note::
1515
This tutorial assumes that you have a working installation of PurlDB and MatchCode.io.
16-
If you don't, please refer to the `installation <../purldb/overview.html#installation>`_ page.
16+
If you don't, please refer to the `installation <../how-to-guides/installation.html#installation>`_ page.
1717

1818

1919
Why mapping binary back to sources?

docs/source/how-to-guides/installation.rst

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1+
============
12
Installation
23
============
34

4-
This article will detail the steps needed to set up a PurlDB instance with
5-
MatchCode and package scanning using Docker.
5+
This article details the steps needed to set up a PurlDB instance with MatchCode and package
6+
scanning using Docker.
67

7-
MatchCode.io requires that it is installed and running alongside an instance of
8-
PurlDB, as it needs direct access to PurlDB's data. This is done by running the
9-
PurlDB and MatchCode.io services on the same Docker network.
8+
MatchCode.io requires that it is installed and running alongside an instance of PurlDB, as it needs
9+
direct access to PurlDB's data. This is done by running the PurlDB and MatchCode.io services on the
10+
same Docker network.
1011

1112
.. code-block:: console
1213

docs/source/how-to-guides/symbols_and_strings.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ from codebase resources.
99

1010
.. note::
1111
This tutorial assumes that you have a working installation of PurlDB.
12-
If you don't, please refer to the `installation <../purldb/overview.html#installation>`_ page.
12+
If you don't, please refer to the `installation <../how-to-guides/installation.html#installation>`_ page.
1313

1414

1515
Problem

docs/source/index.rst

+9-7
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
1-
Welcome to PurlDB documentation!
1+
=========================================
2+
Welcome to PurlDB documentation!
23
=========================================
34

45

56
PurlDB aka. ``Package URL Database`` is a database of software package metadata keyed by Package-URL
67
or purl that offers information and identification services about software packages.
78

8-
A purl or Package-URL is an attempt to standardize existing approaches to reliably identify and
9+
A PURL or Package-URL is an attempt to standardize existing approaches to reliably identify and
910
locate software packages in general and Free and Open Source Software (FOSS) packages in
1011
particular.
1112

12-
A purl is a URL string used to identify and locate a software package in a mostly universal and
13+
A PURL is a URL string used to identify and locate a software package in a mostly universal and
1314
uniform way across programming languages, package managers, packaging conventions, tools, APIs and
1415
databases.
1516

1617
Why PurlDB?
1718
------------------
1819

19-
Modern software is assembled from 100's or 1000's of FOSS packages: being able to catalog these,
20-
normalize their metadata, track their versions, licenses and dependencies and being able to locate
21-
and identify them is essential to healthy, sustainable and secure modern software development.
20+
Modern software is assembled from 1000's of FOSS packages: being able to catalog these, normalize
21+
their metadata, track their versions, licenses and dependencies and being able to discover, locate
22+
and identify them as used in a codebase is essential to healthy, sustainable and secure modern
23+
software development.
2224

2325
This is what PurlDB is all about and it offers:
2426

@@ -54,7 +56,7 @@ What's in PurlDB?
5456
The PurlDB project consists of these main tools:
5557

5658
- PackageDB that is the database and reference model (based on ScanCode toolkit)
57-
that contains package data with purl (Package URLs) being a first class citizen and the primaty
59+
that contains package data with PURL (Package URLs) being a first class citizen and the primary
5860
key to access information.
5961

6062
- MineCode that contains utilities to mine package repositories and populate the PackageDB

docs/source/matchcode/index.rst

+20-15
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
Matchcode
2-
==========
1+
===========
2+
Matchcode
3+
===========
34

45
MatchCode.io
56
------------
@@ -8,12 +9,16 @@ MatchCode.io is a Django app, based off of ScanCode.io, that exposes one API
89
endpoint, ``api/matching``, which takes a ScanCode.io codebase scan, and
910
performs Package matching on it.
1011

11-
Currently, it performs three matching steps:
12+
Currently, it performs three matching steps, using the PackageDB indices:
1213

13-
* Match codebase resources against the Packages in the PackageDB
14-
* Match codebase resources against the Resources in the PackageDB
15-
* Match codebase directories against the directory matching indices of
16-
MatchCode
14+
* Match codebase files against whole Packages archives
15+
* Match codebase files exactly against files
16+
* Match codebase directories exactly and approximately against directory indices
17+
18+
Upcoming features include:
19+
20+
* Match codebase files approximately
21+
* Match codebase file fragments (aka. snippets) including attempting to match AI-generated code
1722

1823

1924
MatchCode.io API Endpoints
@@ -28,23 +33,23 @@ MatchCode.io API Endpoints
2833
Docker Setup for Local Development and Testing
2934
----------------------------------------------
3035

31-
PurlDB and MatchCode.io are two separate Django apps. In order to run both of
32-
these Django apps on the same host, we need to use Traefik.
36+
PurlDB and MatchCode.io are two separate Django apps. In order to run both of these Django apps on
37+
the same host, we need to use Traefik.
3338

34-
Traefik is an edge router that receives requests and finds out which services
35-
are responsible for handling them. In the docker-compose.yml files for PurlDB
36-
and MatchCode.io, we have made these two services part of the same Docker
37-
network and set up the routes for each service.
39+
Traefik is an edge router that receives requests and finds out which services are responsible for
40+
handling them. In the docker-compose.yml files for PurlDB and MatchCode.io, we have made these two
41+
services part of the same Docker network and set up the routes for each service.
3842

39-
All requests to the host go to the PurlDB service, but requests that go to the
40-
``api/matching`` endpoint are routed to the MatchCode.io service.
43+
All requests to the host go to the PurlDB service, but requests that go to the ``api/matching``
44+
endpoint are routed to the MatchCode.io service.
4145

4246
To run PurlDB and MatchCode.io with Docker:
4347
::
4448

4549
docker compose -f docker-compose.yml up -d
4650
docker compose -f docker-compose.matchcodeio.yml up -d
4751

52+
4853
ScanCode.io pipeline
4954
---------------------
5055

docs/source/matchcode/matchcode-pipeline.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1-
Code Matching
2-
=============
1+
===========================
2+
Code Matching pipeline
3+
===========================
34

45
The aim of this tutorial is to show how to use the MatchCode.io API to perform
56
code matching on an archive of files.
67

78
.. note::
89
This tutorial assumes that you have a working installation of PurlDB. If you
910
don't, please refer to the `installation
10-
<../purldb/overview.html#installation>`_ page.
11+
<../how-to-guides/installation.html#installation>`_ page.
1112

1213
Throughout this tutorial, we will use ``pkg:npm/[email protected]`` and a
1314
modified copy of ``index.js`` from it.

matchcode/api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
from matchcode_toolkit.fingerprinting import create_halohash_chunks
1616
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
1717
from matchcode_toolkit.fingerprinting import split_fingerprint
18-
from matchcode_toolkit.halohash import byte_hamming_distance
1918
from rest_framework.decorators import action
2019
from rest_framework.response import Response
2120
from rest_framework.serializers import CharField
@@ -25,6 +24,7 @@
2524
from rest_framework.serializers import ReadOnlyField
2625
from rest_framework.serializers import Serializer
2726
from rest_framework.viewsets import ReadOnlyModelViewSet
27+
from samecode.halohash import byte_hamming_distance
2828

2929
from matchcode.models import ApproximateDirectoryContentIndex
3030
from matchcode.models import ApproximateDirectoryStructureIndex
+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Generated by Django 5.1.2 on 2024-10-30 23:19
2+
3+
import django.db.models.deletion
4+
from django.db import migrations, models
5+
6+
7+
class Migration(migrations.Migration):
8+
9+
dependencies = [
10+
("matchcode", "0002_alter_approximatedirectorycontentindex_package_and_more"),
11+
("packagedb", "0086_alter_party_name"),
12+
]
13+
14+
operations = [
15+
migrations.CreateModel(
16+
name="SnippetIndex",
17+
fields=[
18+
(
19+
"id",
20+
models.AutoField(
21+
auto_created=True,
22+
primary_key=True,
23+
serialize=False,
24+
verbose_name="ID",
25+
),
26+
),
27+
(
28+
"fingerprint",
29+
models.BinaryField(
30+
db_index=True,
31+
help_text="Binary form of a snippet fingerprint",
32+
max_length=16,
33+
),
34+
),
35+
(
36+
"package",
37+
models.ForeignKey(
38+
help_text="The Package that this snippet fingerprint is from",
39+
on_delete=django.db.models.deletion.CASCADE,
40+
to="packagedb.package",
41+
),
42+
),
43+
(
44+
"resource",
45+
models.ForeignKey(
46+
help_text="The Package that this snippet fingerprint is from",
47+
on_delete=django.db.models.deletion.CASCADE,
48+
to="packagedb.resource",
49+
),
50+
),
51+
],
52+
),
53+
]

0 commit comments

Comments
 (0)