From e9fddc930d67e6784459ad40605c52623d673b9a Mon Sep 17 00:00:00 2001 From: WilmerGaspar Date: Wed, 17 Jun 2026 21:05:28 -0600 Subject: [PATCH 1/6] fix(metadata): encode multi-word Scopus queries Encode main and special Scopus query terms after converting spaces to AND for multi-word keyword handling. --- .../metadata_extractor/fetch_metadata.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/comproscanner/metadata_extractor/fetch_metadata.py b/src/comproscanner/metadata_extractor/fetch_metadata.py index 6ca8f42b..a4f50ca3 100644 --- a/src/comproscanner/metadata_extractor/fetch_metadata.py +++ b/src/comproscanner/metadata_extractor/fetch_metadata.py @@ -115,8 +115,15 @@ def _construct_url(self, cursor, year, query, special_query): Returns: str: The constructed URL """ - base = f"{self.base_url}PUBYEAR+%3D+{year}+{query}" - url = base + (f"+{special_query}" if special_query else "") + "&count=200" + encoded_query = urllib.parse.quote(query.replace(" ", " AND ")) + encoded_special_query = ( + urllib.parse.quote(special_query.replace(" ", " AND ")) + if special_query + else "" + ) + + base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" + url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" url += f"&cursor={cursor}" return url From dd605f7a02f306e553b8073bfd007a1c46d0bbe3 Mon Sep 17 00:00:00 2001 From: WilmerGaspar Date: Thu, 18 Jun 2026 22:21:43 -0600 Subject: [PATCH 2/6] fix(metadata): normalize multi-word keyword handling --- src/comproscanner/comproscanner.py | 14 ++++++++------ .../metadata_extractor/fetch_metadata.py | 13 +++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/comproscanner/comproscanner.py b/src/comproscanner/comproscanner.py index d780e1ed..a2a85c7c 100644 --- a/src/comproscanner/comproscanner.py +++ b/src/comproscanner/comproscanner.py @@ -54,11 +54,13 @@ class ComProScanner: def __init__(self, main_property_keyword: str = None): - self.main_property_keyword = main_property_keyword - if self.main_property_keyword is None: - raise ValueErrorHandler( - "Please provide a main property keyword to proceed." - ) + if main_property_keyword is None: + raise ValueErrorHandler( + "Please provide a main property keyword to proceed." + ) + + self.main_property_keyword = main_property_keyword.replace(" ", "_") + self.main_property_search_keyword = self.main_property_keyword.replace("_", " ") def collect_metadata( self, @@ -501,7 +503,7 @@ def extract_composition_property_data( f"results/extracted_data/{self.main_property_keyword}/related_figures" ) if materials_data_identifier_query is None: - materials_data_identifier_query = f"Is there any material chemical composition and corresponding {self.main_property_keyword} value mentioned in the paper? Give one word answer. Either yes or no." + materials_data_identifier_query = f"Is there any material chemical composition and corresponding {self.main_property_search_keyword} value mentioned in the paper? Give one word answer. Either yes or no." preparator = MatPropDataPreparator( main_property_keyword=self.main_property_keyword, main_extraction_keyword=main_extraction_keyword, diff --git a/src/comproscanner/metadata_extractor/fetch_metadata.py b/src/comproscanner/metadata_extractor/fetch_metadata.py index a4f50ca3..2248ba44 100644 --- a/src/comproscanner/metadata_extractor/fetch_metadata.py +++ b/src/comproscanner/metadata_extractor/fetch_metadata.py @@ -115,12 +115,13 @@ def _construct_url(self, cursor, year, query, special_query): Returns: str: The constructed URL """ - encoded_query = urllib.parse.quote(query.replace(" ", " AND ")) - encoded_special_query = ( - urllib.parse.quote(special_query.replace(" ", " AND ")) - if special_query - else "" - ) + search_query = query.replace("_", " ") +encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) +encoded_special_query = ( + urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) + if special_query + else "" +) base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" From 6c2e4acf68f2da0788b6c27026884fbec17e69c1 Mon Sep 17 00:00:00 2001 From: WilmerGaspar Date: Mon, 22 Jun 2026 20:43:45 -0600 Subject: [PATCH 3/6] fix: correct indentation in keyword handling changes --- src/comproscanner/comproscanner.py | 14 +++++++------- .../metadata_extractor/fetch_metadata.py | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/comproscanner/comproscanner.py b/src/comproscanner/comproscanner.py index a2a85c7c..38d321de 100644 --- a/src/comproscanner/comproscanner.py +++ b/src/comproscanner/comproscanner.py @@ -53,14 +53,14 @@ class ComProScanner: - def __init__(self, main_property_keyword: str = None): - if main_property_keyword is None: - raise ValueErrorHandler( - "Please provide a main property keyword to proceed." - ) + def __init__(self, main_property_keyword: str = None): + if main_property_keyword is None: + raise ValueErrorHandler( + "Please provide a main property keyword to proceed." + ) - self.main_property_keyword = main_property_keyword.replace(" ", "_") - self.main_property_search_keyword = self.main_property_keyword.replace("_", " ") + self.main_property_keyword = main_property_keyword.replace(" ", "_") + self.main_property_search_keyword = self.main_property_keyword.replace("_", " ") def collect_metadata( self, diff --git a/src/comproscanner/metadata_extractor/fetch_metadata.py b/src/comproscanner/metadata_extractor/fetch_metadata.py index 2248ba44..2171f23b 100644 --- a/src/comproscanner/metadata_extractor/fetch_metadata.py +++ b/src/comproscanner/metadata_extractor/fetch_metadata.py @@ -102,7 +102,7 @@ def __init__( } self.is_exceeded = False - def _construct_url(self, cursor, year, query, special_query): + def _construct_url(self, cursor, year, query, special_query): """ Construct the URL for the request with cursor-based pagination @@ -116,12 +116,12 @@ def _construct_url(self, cursor, year, query, special_query): str: The constructed URL """ search_query = query.replace("_", " ") -encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) -encoded_special_query = ( - urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) - if special_query - else "" -) + encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) + encoded_special_query = ( + urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) + if special_query + else "" + ) base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" From 96ce3fba69b4b10d834506059718fa5d433ed66d Mon Sep 17 00:00:00 2001 From: WilmerGaspar Date: Tue, 23 Jun 2026 06:24:46 -0600 Subject: [PATCH 4/6] fix: correct ComProScanner init indentation --- src/comproscanner/comproscanner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/comproscanner/comproscanner.py b/src/comproscanner/comproscanner.py index 38d321de..17e9076d 100644 --- a/src/comproscanner/comproscanner.py +++ b/src/comproscanner/comproscanner.py @@ -53,7 +53,7 @@ class ComProScanner: - def __init__(self, main_property_keyword: str = None): + def __init__(self, main_property_keyword: str = None): if main_property_keyword is None: raise ValueErrorHandler( "Please provide a main property keyword to proceed." From 2fbe7f806162003ce3cc26ef1af79c7975805668 Mon Sep 17 00:00:00 2001 From: WilmerGaspar Date: Tue, 23 Jun 2026 06:42:59 -0600 Subject: [PATCH 5/6] fix: correct metadata URL construction indentation --- .../metadata_extractor/fetch_metadata.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/comproscanner/metadata_extractor/fetch_metadata.py b/src/comproscanner/metadata_extractor/fetch_metadata.py index 2171f23b..b4ba7eb1 100644 --- a/src/comproscanner/metadata_extractor/fetch_metadata.py +++ b/src/comproscanner/metadata_extractor/fetch_metadata.py @@ -103,30 +103,30 @@ def __init__( self.is_exceeded = False def _construct_url(self, cursor, year, query, special_query): - """ - Construct the URL for the request with cursor-based pagination - - Args: - cursor (str): The cursor value ('*' for first request, or next cursor from previous response) - year (int): The year for the request - query (str): The query for the request - special_query (str): The special query for the request + """ + Construct the URL for the request with cursor-based pagination + + Args: + cursor (str): The cursor value ('*' for first request, or next cursor from previous response) + year (int): The year for the request + query (str): The query for the request + special_query (str): The special query for the request + + Returns: + str: The constructed URL + """ + search_query = query.replace("_", " ") + encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) + encoded_special_query = ( + urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) + if special_query + else "" + ) - Returns: - str: The constructed URL - """ - search_query = query.replace("_", " ") - encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) - encoded_special_query = ( - urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) - if special_query - else "" - ) - - base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" - url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" - url += f"&cursor={cursor}" - return url + base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" + url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" + url += f"&cursor={cursor}" + return url def _send_request(self, url): """ From 7fb6d9c9e480d52554e1009c812575240e0673f3 Mon Sep 17 00:00:00 2001 From: WilmerGaspar Date: Tue, 23 Jun 2026 21:01:23 -0600 Subject: [PATCH 6/6] fix: move construct url back to FetchMetadata class --- .../metadata_extractor/fetch_metadata.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/comproscanner/metadata_extractor/fetch_metadata.py b/src/comproscanner/metadata_extractor/fetch_metadata.py index b4ba7eb1..a13b70d1 100644 --- a/src/comproscanner/metadata_extractor/fetch_metadata.py +++ b/src/comproscanner/metadata_extractor/fetch_metadata.py @@ -102,31 +102,31 @@ def __init__( } self.is_exceeded = False - def _construct_url(self, cursor, year, query, special_query): - """ - Construct the URL for the request with cursor-based pagination - - Args: - cursor (str): The cursor value ('*' for first request, or next cursor from previous response) - year (int): The year for the request - query (str): The query for the request - special_query (str): The special query for the request - - Returns: - str: The constructed URL - """ - search_query = query.replace("_", " ") - encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) - encoded_special_query = ( - urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) - if special_query - else "" - ) + def _construct_url(self, cursor, year, query, special_query): + """ + Construct the URL for the request with cursor-based pagination - base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" - url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" - url += f"&cursor={cursor}" - return url + Args: + cursor (str): The cursor value ('*' for first request, or next cursor from previous response) + year (int): The year for the request + query (str): The query for the request + special_query (str): The special query for the request + + Returns: + str: The constructed URL + """ + search_query = query.replace("_", " ") + encoded_query = urllib.parse.quote(search_query.replace(" ", " AND ")) + encoded_special_query = ( + urllib.parse.quote(special_query.replace("_", " ").replace(" ", " AND ")) + if special_query + else "" + ) + + base = f"{self.base_url}PUBYEAR+%3D+{year}+{encoded_query}" + url = base + (f"+{encoded_special_query}" if encoded_special_query else "") + "&count=200" + url += f"&cursor={cursor}" + return url def _send_request(self, url): """