From bfa8e9a3f5180ebd12a5a86a66e1ceade401dc4b Mon Sep 17 00:00:00 2001 From: dikshaa2909 Date: Tue, 17 Feb 2026 19:06:34 +0530 Subject: [PATCH] Fix standalone year-only copyright detection without holder Signed-off-by: dikshaa2909 --- src/cluecode/copyrights.py | 58 ++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 6d17467acf..fe5b59a70d 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -326,8 +326,9 @@ def detect(self, 'IS', 'HELD', ]) - # then walk the parse parse_tree, collecting copyrights, years and authors + # walk parse tree for tree_node in parse_tree: + if not isinstance(tree_node, Tree): if TRACE: logger_debug(f'CopyrightDetector: parse_tree node: {tree_node}') @@ -336,6 +337,7 @@ def detect(self, tree_node_label = tree_node.label if (include_copyrights or include_holders) and 'COPYRIGHT' in tree_node_label: + copyrght = build_detection_from_node( node=tree_node, cls=CopyrightDetection, @@ -344,40 +346,41 @@ def detect(self, refiner=refine_copyright, ) - if TRACE or TRACE_DEEP: - logger_debug(f'CopyrightDetector: final copyright: {copyrght}') + if not copyrght: + continue - if copyrght: - if include_copyrights: - yield copyrght + holder = None + if include_holders: + holder = build_detection_from_node( + node=tree_node, + cls=HolderDetection, + ignored_labels=non_holder_labels, + refiner=refine_holder, + ) - if include_holders: - # By default we strip email and urls from holders .... + if not holder: holder = build_detection_from_node( node=tree_node, cls=HolderDetection, - ignored_labels=non_holder_labels, + ignored_labels=non_holder_labels_mini, refiner=refine_holder, ) - if not holder: - # ... but if we have no holder, we try again and - # this time we keep email and URLs for holders using - # "non_holder_labels_mini" as an "ignores" label set - holder = build_detection_from_node( - node=tree_node, - cls=HolderDetection, - ignored_labels=non_holder_labels_mini, - refiner=refine_holder, - ) + if ( + copyrght.copyright + and re.match(r'^Copyright\s+\d{4}$', copyrght.copyright.strip()) + and not holder + ): + continue - if holder: - if TRACE: - logger_debug(f'CopyrightDetector: holders: {holder}') + if include_copyrights: + yield copyrght - yield holder + if include_holders and holder: + yield holder elif include_authors and tree_node_label == 'AUTHOR': + author = build_detection_from_node( node=tree_node, cls=AuthorDetection, @@ -388,10 +391,9 @@ def detect(self, if author: if TRACE: logger_debug(f'CopyrightDetector: detected authors: {author}') - yield author - + def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split): """ Return an iterable of pygmars.Token built from a ``numbered_lines`` iterable @@ -3559,6 +3561,7 @@ def refine_copyright(c): c = strip_suffixes(c, suffixes=COPYRIGHTS_SUFFIXES) c = strip_trailing_period(c) c = c.strip("'") + c = re.sub(r'\b(\d{4})-\$', r'\1', c) return c.strip() @@ -3620,8 +3623,7 @@ def refine_author(a): """ if not a: return - # FIXME: we could consider to split comma separated lists such as - # gthomas, sorin@netappi.com, andrew.lunn@ascom.che.g. + a = remove_some_extra_words_and_punct(a) a = refine_names(a, prefixes=AUTHORS_PREFIXES) a = a.strip() @@ -3633,10 +3635,12 @@ def refine_author(a): a = refine_names(a, prefixes=AUTHORS_PREFIXES) a = a.strip() a = a.strip('+-') + if a and a.lower() not in AUTHORS_JUNK and not a.startswith(AUTHORS_JUNK_PREFIX): return a + def refine_names(s, prefixes): """ Refine a detected ``s`` name string from a author or holder.