From a252eb16e947131356162618737f56581548d046 Mon Sep 17 00:00:00 2001 From: andrei Date: Mon, 23 Feb 2026 17:11:15 +0100 Subject: [PATCH 1/4] feat: a new field that contains merged links --- server/workers/base/src/base.py | 1 + server/workers/common/common/enrichment.py | 6 ++++-- server/workers/orcid/src/orcid_service.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/server/workers/base/src/base.py b/server/workers/base/src/base.py index 320af9689..12fbd6626 100644 --- a/server/workers/base/src/base.py +++ b/server/workers/base/src/base.py @@ -251,6 +251,7 @@ def filter_duplicates(df, service, params): df["doi_duplicate"] = False df["has_relations"] = False df["link_duplicate"] = False + df["pdf_link_candidates_from_duplicates"] = "" df["duplicates"] = df.apply( lambda x: ",".join([x["id"], x["duplicates"]]) if len(x["duplicates"].split(",")) >= 1 diff --git a/server/workers/common/common/enrichment.py b/server/workers/common/common/enrichment.py index df58da470..fd11ef34e 100644 --- a/server/workers/common/common/enrichment.py +++ b/server/workers/common/common/enrichment.py @@ -261,7 +261,9 @@ def apply_oa_state_improvements(df, anchor_idx, accumulator): def apply_link_improvements(df, anchor_idx, all_links): """ - Applies improvements for link to the anchor element. + Applies improvements for link to the anchor element: set in + pdf_link_candidates_from_duplicates column if there are any links + from duplicates that can be used for PDF lookup. 
Args: df: DataFrame with data @@ -272,4 +274,4 @@ def apply_link_improvements(df, anchor_idx, all_links): unique_links = deduplicate_links(all_links) if unique_links: merged_links = '; '.join(sorted(unique_links)) - df.loc[anchor_idx, 'link'] = merged_links + df.loc[anchor_idx, 'pdf_link_candidates_from_duplicates'] = merged_links diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index 0e32e2f60..e138ee274 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -400,7 +400,7 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra 'relations', 'annotations', 'repo', 'source', 'volume', 'issue', 'page', 'issn', 'citation_count', 'cited_by_wikipedia_count', 'cited_by_msm_count', 'cited_by_policies_count', 'cited_by_patents_count', 'cited_by_accounts_count', 'cited_by_fbwalls_count', - 'merged_dois', + 'merged_dois', 'pdf_link_candidates_from_duplicates', 'cited_by_feeds_count', 'cited_by_gplus_count', 'cited_by_rdts_count', From 3af393c55f80f0ba525c1dbe656d316fd80c0822 Mon Sep 17 00:00:00 2001 From: andrei Date: Mon, 23 Feb 2026 17:29:56 +0100 Subject: [PATCH 2/4] feat: remove anchor link from the array with links from doubles --- server/workers/common/common/enrichment.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/server/workers/common/common/enrichment.py b/server/workers/common/common/enrichment.py index fd11ef34e..79fa1931e 100644 --- a/server/workers/common/common/enrichment.py +++ b/server/workers/common/common/enrichment.py @@ -273,5 +273,20 @@ def apply_link_improvements(df, anchor_idx, all_links): if all_links: unique_links = deduplicate_links(all_links) if unique_links: - merged_links = '; '.join(sorted(unique_links)) + anchor_link = get_anchor_field_value(df, anchor_idx, 'link') + unique_links_without_anchor_link = [x for x in unique_links if x != anchor_link] + + merged_links = '; 
'.join(sorted(unique_links_without_anchor_link)) df.loc[anchor_idx, 'pdf_link_candidates_from_duplicates'] = merged_links + +def get_anchor_field_value(df, anchor_idx, column_name): + """ + Returns the value of the given column for the anchor row, or None if + the column is missing or the value is empty/NaN. + """ + if column_name not in df.columns: + return None + value = df.loc[anchor_idx, column_name] + if pd.isna(value) or value == '': + return None + return value From 3cc946e00c1abc257c296102d117e1ada567c4fe Mon Sep 17 00:00:00 2001 From: andrei Date: Tue, 24 Feb 2026 09:59:52 +0100 Subject: [PATCH 3/4] feat: parsing of pdf link candidates --- vis/js/dataprocessing/managers/DataManager.ts | 4 ++++ vis/js/types/models/paper.ts | 2 ++ vis/js/utils/data.ts | 19 +++++++++++++++++++ vis/test/data/papers.ts | 2 ++ 4 files changed, 27 insertions(+) diff --git a/vis/js/dataprocessing/managers/DataManager.ts b/vis/js/dataprocessing/managers/DataManager.ts index f5d7eced6..436bd1e65 100644 --- a/vis/js/dataprocessing/managers/DataManager.ts +++ b/vis/js/dataprocessing/managers/DataManager.ts @@ -14,6 +14,7 @@ import { getListLink, getOpenAccessLink, getOutlink, + getPdfLinkCandidatesFromDuplicates, getValueOrZero, getVisibleMetric, isOpenAccess, @@ -257,6 +258,9 @@ class DataManager { paper.oa_link = getOpenAccessLink(paper, this.config); paper.outlink = getOutlink(paper, this.config); paper.list_link = getListLink(paper, this.config, this.context); + + paper.pdf_link_candidates_from_duplicates = + getPdfLinkCandidatesFromDuplicates(paper); } __parseComments(paper: any) { diff --git a/vis/js/types/models/paper.ts b/vis/js/types/models/paper.ts index a87a440c6..cf43198ba 100644 --- a/vis/js/types/models/paper.ts +++ b/vis/js/types/models/paper.ts @@ -68,6 +68,8 @@ export interface CommonPaperDataForAllIntegrations { zoomedY: number; zoomedWidth: number; zoomedHeight: number; + + pdf_link_candidates_from_duplicates: string[] | null; } export interface PubmedPaper extends 
CommonPaperDataForAllIntegrations { diff --git a/vis/js/utils/data.ts b/vis/js/utils/data.ts index 76585c686..3f16568ab 100644 --- a/vis/js/utils/data.ts +++ b/vis/js/utils/data.ts @@ -285,6 +285,25 @@ export const getListLink = (paper, config, context) => { return {}; }; +/** + * Parses the paper's pdf link candidates from duplicates into an array of strings. + * + * @param {object} paper paper object + * + * @returns array of strings or null if no candidates are found + */ +export const getPdfLinkCandidatesFromDuplicates = (paper): string[] | null => { + if ( + typeof paper.pdf_link_candidates_from_duplicates !== "string" || + !paper.pdf_link_candidates_from_duplicates + ) { + return null; + } + + const links = paper.pdf_link_candidates_from_duplicates.split(";"); + return links.length > 0 ? links : null; +}; + /** * Parses the paper's authors string into an object array. * diff --git a/vis/test/data/papers.ts b/vis/test/data/papers.ts index 32f6f6c71..496602362 100644 --- a/vis/test/data/papers.ts +++ b/vis/test/data/papers.ts @@ -55,6 +55,8 @@ const MOCK_COMMON_PAPER_DATA: CommonPaperDataForAllIntegrations = { zoomedY: 1, zoomedWidth: 1, zoomedHeight: 1, + + pdf_link_candidates_from_duplicates: null, }; export const MOCK_BASE_PAPER_DATA: BasePaper = { From 2b22d6954fef82c5a7a831733ee6975ac4743a2b Mon Sep 17 00:00:00 2001 From: andrei Date: Tue, 24 Feb 2026 10:20:09 +0100 Subject: [PATCH 4/4] feat: usage of the pdf link candidates for pdf loading --- vis/js/utils/usePdfLookup.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vis/js/utils/usePdfLookup.ts b/vis/js/utils/usePdfLookup.ts index 02537fe2d..897b83dd8 100644 --- a/vis/js/utils/usePdfLookup.ts +++ b/vis/js/utils/usePdfLookup.ts @@ -50,6 +50,12 @@ const usePdfLookup = (paper: Paper, serverUrl: string, service: string) => { let possiblePDFs = ""; let fallbackUrl = ""; if (service === "base") { + let pdfLinkCandidatesFromDuplicates = null; + + if ("pdf_link_candidates_from_duplicates" in 
paper) { + pdfLinkCandidatesFromDuplicates = paper.pdf_link_candidates_from_duplicates as string[] | null; + } + possiblePDFs = encodeURIComponent(paper.link) + ";" + @@ -59,6 +65,10 @@ const usePdfLookup = (paper: Paper, serverUrl: string, service: string) => { .split("; ") .map((x) => encodeURIComponent(x)) .join("; "); + + if (pdfLinkCandidatesFromDuplicates) { + possiblePDFs += ";" + pdfLinkCandidatesFromDuplicates.map((x: string) => encodeURIComponent(x)).join("; "); + } } if (service === "openaire") {