Skip to content

Refine ScanCode.io d2d pipeline for JavaScript using string literals mapping #1652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scanpipe/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,7 @@ def filter(self, qs, value):
("about_file", "about file"),
("java_to_class", "java to class"),
("jar_to_source", "jar to source"),
("javascript_strings", "js strings"),
("javascript_symbols", "js symbols"),
("js_compiled", "js compiled"),
("js_colocation", "js colocation"),
Expand Down
6 changes: 6 additions & 0 deletions scanpipe/pipelines/deploy_to_develop.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def steps(cls):
cls.map_jar_to_source,
cls.map_javascript,
cls.map_javascript_symbols,
cls.map_javascript_strings,
cls.map_elf,
cls.map_go,
cls.map_rust,
Expand Down Expand Up @@ -202,6 +203,11 @@ def map_javascript_symbols(self):
"""Map deployed JavaScript, TypeScript to its sources using symbols."""
d2d.map_javascript_symbols(project=self.project, logger=self.log)

@optional_step("JavaScript")
def map_javascript_strings(self):
    """Map deployed JavaScript, TypeScript files to sources via string literals."""
    # Thin pipeline step: the matching itself lives in the ``d2d`` pipe module.
    d2d.map_javascript_strings(logger=self.log, project=self.project)

@optional_step("Elf")
def map_elf(self):
"""Map ELF binaries to their sources."""
Expand Down
84 changes: 84 additions & 0 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
from scanpipe.pipes import purldb
from scanpipe.pipes import resolve
from scanpipe.pipes import scancode
from scanpipe.pipes import stringmap
from scanpipe.pipes import symbolmap
from scanpipe.pipes import symbols

Expand Down Expand Up @@ -2055,3 +2056,86 @@ def _map_javascript_symbols(to_resource, javascript_from_resources, logger):
to_resource.update(status=flag.MAPPED)
return 1
return 0


def map_javascript_strings(project, logger=None):
    """
    Map deployed JavaScript, TypeScript to its sources using string literals.

    Candidates on the ``to/`` side are ``.ts``/``.js`` files selected with
    ``has_no_relation()`` and a non-empty ``extra_data``. Candidates on the
    ``from/`` side are ``.ts``/``.js`` files with a non-empty ``extra_data``
    whose path does not contain ``/test/``. Return early, without logging,
    when either side has no candidate.
    """
    js_extensions = [".ts", ".js"]
    files = project.codebaseresources.files()

    deployed_js = (
        files.to_codebase()
        .has_no_relation()
        .filter(extension__in=js_extensions)
        .exclude(extra_data={})
    )
    source_js = (
        files.from_codebase()
        .exclude(path__contains="/test/")
        .filter(extension__in=js_extensions)
        .exclude(extra_data={})
    )

    # Both sides are required for a mapping to be possible.
    if not source_js.exists() or not deployed_js.exists():
        return

    source_count = source_js.count()
    deployed_count = deployed_js.count()
    if logger:
        logger(
            f"Mapping {deployed_count:,d} JavaScript resources"
            f" using string literals against {source_count:,d}"
            " from/ resources."
        )

    progress = LoopProgress(deployed_count, logger)
    # Each helper call returns 1 when a relation was created, 0 otherwise.
    mapped_total = sum(
        _map_javascript_strings(to_resource, source_js, logger)
        for to_resource in progress.iter(deployed_js.iterator(chunk_size=2000))
    )

    if logger:
        logger(f"{mapped_total:,d} resource mapped using strings")


def _map_javascript_strings(to_resource, javascript_from_resources, logger):
"""
Map a deployed JavaScript resource to its source using string literals and
return 1 if match is found otherwise return 0.
"""
ignoreable_string_threshold = 5
to_strings = to_resource.extra_data.get("source_strings")
to_strings_set = set(to_strings)

if not to_strings or len(to_strings_set) < ignoreable_string_threshold:
return 0

best_matching_score = 0
best_match = None
for source_js in javascript_from_resources:
from_strings = source_js.extra_data.get("source_strings")
from_strings_set = set(from_strings)
if not from_strings or len(from_strings_set) < ignoreable_string_threshold:
continue

is_match, similarity = stringmap.match_source_strings_to_deployed(
source_strings=from_strings,
deployed_strings=to_strings,
)

if is_match and similarity > best_matching_score:
best_matching_score = similarity
best_match = source_js

if best_match:
pipes.make_relation(
from_resource=best_match,
to_resource=to_resource,
map_type="javascript_strings",
extra_data={"js_string_map_score": similarity},
)
to_resource.update(status=flag.MAPPED)
return 1
return 0
65 changes: 65 additions & 0 deletions scanpipe/pipes/stringmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from collections import Counter

STRING_MATCHING_RATIO_JAVASCRIPT = 0.7
SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT = 10
STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE = 0.5


def match_source_strings_to_deployed(source_strings, deployed_strings):
    """
    Return an ``(is_match, ratio)`` tuple for source and deployed string literals.

    The ratio is the number of occurrences, on both sides, of the strings
    present in both lists, divided by the total number of strings on both
    sides. It is ``0`` when the two lists share no string.

    A match is declared when the ratio is above
    ``STRING_MATCHING_RATIO_JAVASCRIPT``, or when the source has more than
    ``SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT`` strings and the ratio is above
    ``STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE``.
    """
    source_occurrences = Counter(source_strings)
    deployed_occurrences = Counter(deployed_strings)

    # Distinct strings seen on both sides.
    shared_strings = source_occurrences.keys() & deployed_occurrences.keys()
    shared_occurrences = sum(
        source_occurrences[string] + deployed_occurrences[string]
        for string in shared_strings
    )
    if not shared_occurrences:
        return False, 0

    source_total = len(source_strings)
    ratio = shared_occurrences / (source_total + len(deployed_strings))

    if ratio > STRING_MATCHING_RATIO_JAVASCRIPT:
        return True, ratio

    # NOTE(review): the relaxed "small file" ratio is applied when the source
    # has MORE strings than the threshold -- confirm this is the intent.
    relaxed_match = (
        source_total > SMALL_FILE_STRING_THRESHOLD_JAVASCRIPT
        and ratio > STRING_MATCHING_RATIO_JAVASCRIPT_SMALL_FILE
    )
    return relaxed_match, ratio
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
about_resource: cesium
name: cesium
version: 1.125
download_url: https://github.com/CesiumGS/cesium/archive/refs/tags/1.125.zip
homepage_url: https://github.com/CesiumGS/cesium
license_expression: apache-2.0
attribute: yes
package_url: pkg:github/CesiumGS/cesium@1.125

Large diffs are not rendered by default.

Loading
Loading