Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions samples/julia/hello.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
println("Hello Julia")
155 changes: 106 additions & 49 deletions src/summarycode/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,109 @@
from commoncode.fileutils import file_name
from commoncode.fileutils import file_base_name

def get_dynamic_manifest_ends():
"""
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See how we now have a package manifest patterns index with https://github.com/aboutcode-org/scancode-toolkit/pull/4606/changes#diff-13120b0eb8c69b520b66229f7090c12b1102859a76ddd19b4789de2ed1b8818cR85-R100, can you reuse this to do a fast manifest pattern check directly?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the detailed review and for pointing out the new package manifest patterns index. That makes a lot of sense. I will spend some time reviewing the new architecture you linked and start restructuring my approach to align with it. I'll update the PR once I have a new working draft.

Return a tuple of manifest file endings dynamically extracted
from the registered package datafile handlers.
"""
# local import, it breaks the circular dependency loop
from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS

# Seed the set with the original legacy list to appease old, rigid tests
manifest_ends = set([
'package.json',
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do you have this list, this is not dynamically extracted, this just replaces one static list with another, even though you have additional checks.

'pom.xml',
'.nuspec',
'.podspec',
'about.json',
'.ABOUT',
'npm-shrinkwrap.json',
'package-lock.json',
'yarn.lock',
'pnpm-lock.yaml',
'composer.lock',
'composer.json',
'Cargo.toml',
'Cargo.lock',
'Gemfile.lock',
'Gemfile',
'mix.lock',
'mix.exs',
'setup.py',
'pyproject.toml',
'Pipfile.lock',
'Pipfile',
'poetry.lock',
'requirements.txt',
'.conda/environments.txt',
'conanfile.txt',
'conanfile.py',
'DESCRIPTION',
'META.yml',
'META.json',
'Makefile.PL',
'Build.PL',
'cpanfile',
'cpanfile.snapshot',
'go.mod',
'go.sum',
'Gopkg.toml',
'Gopkg.lock',
'Godeps.json',
'vendor.json',
'glide.yaml',
'glide.lock',
'pom.xml',
'build.gradle',
'build.gradle.kts',
'pom.xml',
'.ivy.xml',
'ivy.xml',
'build.sbt',
'build.scala',
'project.clj',
'elm-package.json',
'elm.json',
'rebar.config',
'rebar.config.script',
'rebar.lock',
'shard.yml',
'shard.override.yml',
'shard.lock',
'pubspec.yaml',
'pubspec.lock',
'vcpkg.json',
'Package.swift',
'Package.resolved',
'project.clj',
'metadata',
'pom.xml',
'build.gradle',
'build.gradle.kts',
'.ivy.xml',
'ivy.xml',
'build.sbt',
'build.scala',
'project.clj',
])

for handler in APPLICATION_PACKAGE_DATAFILE_HANDLERS:
# Safely get the path_patterns attribute
path_patterns = getattr(handler, 'path_patterns', tuple())

# Prevent the "String Iteration Bug" if a handler missed a trailing comma
if isinstance(path_patterns, str):
path_patterns = (path_patterns,)

for pattern in path_patterns:
clean_pattern = pattern.lstrip('*/').lower()
if clean_pattern:
manifest_ends.add(clean_pattern)

# str.endswith() in Python requires a tuple, so set is converted before returning
return tuple(manifest_ends)


def get_relative_path(root_path, path):
"""
Return a path relativefrom the posix 'path' relative to a
Expand Down Expand Up @@ -42,53 +145,6 @@ def get_relative_path(root_path, path):
'patents',
)

_MANIFEST_ENDS = {
'.about': 'ABOUT file',
'/bower.json': 'bower',
'/project.clj': 'clojure',
'.podspec': 'cocoapod',
'/composer.json': 'composer',
'/description': 'cran',
'/elm-package.json': 'elm',
'/+compact_manifest': 'freebsd',
'+manifest': 'freebsd',
'.gemspec': 'gem',
'/metadata': 'gem',
# the extracted metadata of a gem archive
'/metadata.gz-extract': 'gem',
'/build.gradle': 'gradle',
'/project.clj': 'clojure',
'.pom': 'maven',
'/pom.xml': 'maven',

'.cabal': 'haskell',
'/haxelib.json': 'haxe',
'/package.json': 'npm',
'.nuspec': 'nuget',
'.pod': 'perl',
'/meta.yml': 'perl',
'/dist.ini': 'perl',

'/pipfile': 'pypi',
'/setup.cfg': 'pypi',
'/setup.py': 'pypi',
'/PKG-INFO': 'pypi',
'/pyproject.toml': 'pypi',
'.spec': 'rpm',
'/cargo.toml': 'rust',
'.spdx': 'spdx',
'/dependencies': 'generic',

# note that these two cannot be top-level for now
'debian/copyright': 'deb',
'meta-inf/manifest.mf': 'maven',

# TODO: Maven also has sometimes a pom under META-INF/
# 'META-INF/manifest.mf': 'JAR and OSGI',

}

MANIFEST_ENDS = tuple(_MANIFEST_ENDS)

README_STARTS_ENDS = (
'readme',
Expand Down Expand Up @@ -165,17 +221,18 @@ def check_resource_name_start_and_end(resource, STARTS_ENDS):
or base_name.endswith(STARTS_ENDS)
)


def set_classification_flags(resource,
_LEGAL=LEGAL_STARTS_ENDS,
_MANIF=MANIFEST_ENDS,
_README=README_STARTS_ENDS,
):
"""
Set classification flags on the `resource` Resource.
"""
path = resource.path.lower()

#This prevents the global circular import crash
_MANIF = get_dynamic_manifest_ends()

resource.is_legal = is_legal = check_resource_name_start_and_end(resource, _LEGAL)
resource.is_readme = is_readme = check_resource_name_start_and_end(resource, _README)
resource.is_community = check_is_resource_community_file(resource)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@
"is_source": true,
"is_script": false,
"is_legal": true,
"is_manifest": false,
"is_manifest": true,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is wrong, not a manifest.

The path of copyright files should be checked in it's entirety to validate if this is a debian copyright file and only then this can be classified as manifests.

We possibly need to maintain a reject list of datafile handlers which should be ignored while checking if a file is manifest or not, because:

  • sometimes path pattern is not enough and is_datafile() checks happen in functions, which either performs more checks or opens the files

"is_readme": false,
"is_top_level": true,
"is_key_file": true,
Expand Down