Skip to content

reference_retriever

References and cited-by utilities.

pypaperretriever.reference_retriever.ReferenceRetriever

ReferenceRetriever(
    email: str,
    doi: Optional[str] = None,
    pmid: Optional[str] = None,
    standardize: bool = True,
)

Retrieve references and citing articles using a DOI or PMID.

Parameters:

Name Type Description Default
email str

Email address for API access.

required
doi str | None

Digital Object Identifier.

None
pmid str | None

PubMed identifier.

None
standardize bool

If True, the output dictionaries share a common set of keys.

True

Initialize the retriever with either DOI or PMID.

Parameters:

Name Type Description Default
email str

Email for API access

required
doi str

Digital Object Identifier

None
pmid str

PubMed ID

None
Source code in pypaperretriever/reference_retriever.py
def __init__(
    self,
    email: str,
    doi: Optional[str] = None,
    pmid: Optional[str] = None,
    standardize: bool = True,
):
    """Set up a retriever for a paper identified by DOI and/or PMID.

    When a DOI is supplied without a PMID, ``doi_to_pmid`` is called right
    away and whatever it returns is stored on ``self.pmid``.

    Args:
        email (str): Email for API access.
        doi (str, optional): Digital Object Identifier.
        pmid (str, optional): PubMed ID.
        standardize (bool): Stored unchanged on ``self.standardize``.
    """
    self.email, self.doi, self.pmid = email, doi, pmid
    self.standardize = standardize

    print(f"[ReferenceRetriever] Initializing with DOI: {doi} and PMID: {pmid}")

    # Nothing more to do unless we hold a DOI but still lack a PMID.
    lookup_needed = bool(self.doi) and not self.pmid
    if not lookup_needed:
        return

    print(f"[ReferenceRetriever] Converting DOI to PMID for DOI: {self.doi}")
    self.pmid = doi_to_pmid(self.doi, self.email)
    print(f"[ReferenceRetriever] Converted DOI {self.doi} to PMID: {self.pmid}")

fetch_references

fetch_references() -> List[Dict[str, Any]]

Fetch references for the current paper.

Returns:

Type Description
List[Dict[str, Any]]

list[dict[str, Any]]: Reference metadata.

Raises:

Type Description
ValueError

If neither DOI nor PMID is provided.

Source code in pypaperretriever/reference_retriever.py
def fetch_references(self) -> List[Dict[str, Any]]:
    """Return the references of the current paper.

    The lookup itself is delegated to ``self._find_references()``; a falsy
    result from that helper is normalised to an empty list.

    Returns:
        list[dict[str, Any]]: Reference metadata, or ``[]`` when none is found.

    Raises:
        ValueError: If neither DOI nor PMID is provided.
    """
    print(f"[ReferenceRetriever] Fetching references for DOI: {self.doi}, PMID: {self.pmid}")
    if not (self.doi or self.pmid):
        raise ValueError("Either DOI or PMID must be provided.")

    found = self._find_references()
    if found:
        print(f"[ReferenceRetriever] Found {len(found)} references.")
        return found

    print("[ReferenceRetriever] No references found.")
    return []

fetch_cited_by

fetch_cited_by() -> List[Dict[str, Any]]

Fetch articles that cite the current paper.

Returns:

Type Description
List[Dict[str, Any]]

list[dict[str, Any]]: Citing article metadata.

Raises:

Type Description
ValueError

If no PMID is provided and the DOI is missing or cannot be converted to a PMID.

Source code in pypaperretriever/reference_retriever.py
def fetch_cited_by(self) -> List[Dict[str, Any]]:
    """Return the articles that cite the current paper.

    A PMID is required. If only a DOI is held, it is converted with
    ``doi_to_pmid`` and the result is stored on ``self.pmid`` before the
    lookup is delegated to ``self._find_cited_by()``.

    Returns:
        list[dict[str, Any]]: Citing article metadata, or ``[]`` when none is found.

    Raises:
        ValueError: If no PMID is available and the DOI is missing or
            cannot be converted to one.
    """
    print(f"[ReferenceRetriever] Fetching citing articles for DOI: {self.doi}, PMID: {self.pmid}")
    if not self.pmid:
        if not self.doi:
            raise ValueError("PMID must be provided to fetch cited_by articles.")

        print(f"[ReferenceRetriever] Converting DOI to PMID for DOI: {self.doi}")
        self.pmid = doi_to_pmid(self.doi, self.email)
        print(f"[ReferenceRetriever] Converted DOI {self.doi} to PMID: {self.pmid}")
        if not self.pmid:
            raise ValueError("Unable to convert DOI to PMID.")

    citing = self._find_cited_by()
    if citing:
        print(f"[ReferenceRetriever] Found {len(citing)} citing articles.")
        return citing

    print("[ReferenceRetriever] No citing articles found.")
    return []

get_paper_metadata

get_paper_metadata() -> Dict[str, Any]

Fetch metadata for the current paper.

Returns:

Type Description
Dict[str, Any]

dict[str, Any]: Metadata including identifiers, title and authors.

Source code in pypaperretriever/reference_retriever.py
def get_paper_metadata(self) -> Dict[str, Any]:
    """Look up metadata for the current paper.

    The stored PMID is used when present. Otherwise the DOI (if any) is
    converted with ``doi_to_pmid``; the converted PMID is used for this
    lookup only and is not stored on the instance. Details come from
    ``self._fetch_articles_details`` and only the first entry is returned.

    Returns:
        dict[str, Any]: Metadata including identifiers, title and authors,
        or an empty dict when nothing could be retrieved.
    """
    print(f"[ReferenceRetriever] Fetching metadata for DOI: {self.doi}, PMID: {self.pmid}")

    if self.pmid:
        print(f"[ReferenceRetriever] Fetching metadata using PMID: {self.pmid}")
        details = self._fetch_articles_details([self.pmid])
        if not details:
            print(f"[ReferenceRetriever] No metadata found for PMID: {self.pmid}")
            return {}
        print(f"[ReferenceRetriever] Metadata found for PMID: {self.pmid}")
        return details[0]

    if not self.doi:
        # No identifier at all: nothing to look up.
        return {}

    print(f"[ReferenceRetriever] Fetching metadata using DOI: {self.doi}")
    resolved = doi_to_pmid(self.doi, self.email)
    if not resolved:
        print(f"[ReferenceRetriever] Unable to convert DOI {self.doi} to PMID.")
        return {}

    print(f"[ReferenceRetriever] Converted DOI {self.doi} to PMID: {resolved}")
    details = self._fetch_articles_details([resolved])
    if not details:
        print(f"[ReferenceRetriever] No metadata found for PMID: {resolved}")
        return {}
    print(f"[ReferenceRetriever] Metadata found for PMID: {resolved}")
    return details[0]

get_references_europe

get_references_europe(pmid: str) -> List[Dict[str, Any]]

Fetch references from the Europe PMC API.

Parameters:

Name Type Description Default
pmid str

PubMed identifier.

required

Returns:

Type Description
List[Dict[str, Any]]

list[dict[str, Any]]: Reference metadata.

Source code in pypaperretriever/reference_retriever.py
def get_references_europe(self, pmid: str) -> List[Dict[str, Any]]:
    """Fetch references from the Europe PMC API.

    Only the first page is requested (``page=1`` with ``pageSize=1000``), so
    references beyond that page are not retrieved. This is a best-effort
    call: a non-200 response or any exception, including a request timeout,
    is reported on stdout and yields an empty list.

    Args:
        pmid (str): PubMed identifier.

    Returns:
        list[dict[str, Any]]: Reference metadata as produced by
        ``self._parse_europe_references``, or ``[]`` on failure.
    """
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/{pmid}/references?page=1&pageSize=1000&format=json"
    print(f"[ReferenceRetriever] Requesting Europe PMC references from URL: {url}")
    try:
        # Without a timeout requests waits indefinitely on a stalled server;
        # a timeout raises and is handled by the generic except below.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"[ReferenceRetriever] Europe PMC request failed with status code: {response.status_code}")
            return []

        data = response.json()
        references = data.get('referenceList', {}).get('reference', [])
        print(f"[ReferenceRetriever] Europe PMC returned {len(references)} raw references.")
        return self._parse_europe_references(references)
    except Exception as e:
        # Deliberately broad: this lookup must never crash the caller.
        print(f"[ReferenceRetriever] Error fetching references from Europe PMC for PMID {pmid}: {e}")
        return []

get_references_entrez_pubmed

get_references_entrez_pubmed(
    pmid: str,
) -> List[Dict[str, Any]]

Fetch references from PubMed via Entrez.

Parameters:

Name Type Description Default
pmid str

PubMed identifier.

required

Returns:

Type Description
List[Dict[str, Any]]

list[dict[str, Any]]: Reference metadata.

Source code in pypaperretriever/reference_retriever.py
def get_references_entrez_pubmed(self, pmid: str) -> List[Dict[str, Any]]:
    """Fetch references from PubMed via Entrez.

    Sets the module-level ``Entrez.email`` to ``self.email`` before the
    request. Only the first ``PubmedArticle`` record and the first entry of
    its ``ReferenceList`` are inspected. This is a best-effort call: any
    exception is reported on stdout and yields an empty list.

    Args:
        pmid (str): PubMed identifier.

    Returns:
        list[dict[str, Any]]: Reference metadata as produced by
        ``self._parse_pubmed_references``, or ``[]`` when there are no
        references or the request fails.
    """
    Entrez.email = self.email
    print(f"[ReferenceRetriever] Requesting Entrez PubMed references for PMID: {pmid}")
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        try:
            article_details = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails so it is not leaked.
            handle.close()

        if 'PubmedArticle' in article_details:
            ref_list = article_details['PubmedArticle'][0].get('PubmedData', {}).get('ReferenceList', [])
            if ref_list:
                raw_refs = ref_list[0].get('Reference', [])
                parsed_refs = self._parse_pubmed_references(raw_refs)
                print(f"[ReferenceRetriever] Entrez PubMed returned {len(parsed_refs)} parsed references.")
                return parsed_refs
        print("[ReferenceRetriever] Entrez PubMed returned no references.")
        return []
    except Exception as e:
        # Deliberately broad: this lookup must never crash the caller.
        print(f"[ReferenceRetriever] Error fetching references from PubMed for PMID {pmid}: {e}")
        return []

get_references_crossref

get_references_crossref(doi: str) -> List[Dict[str, Any]]

Fetch references from the CrossRef API.

Parameters:

Name Type Description Default
doi str

DOI of the paper.

required

Returns:

Type Description
List[Dict[str, Any]]

list[dict[str, Any]]: Reference metadata.

Source code in pypaperretriever/reference_retriever.py
def get_references_crossref(self, doi: str) -> List[Dict[str, Any]]:
    """Fetch references from the CrossRef API.

    Reads the ``reference`` list from the ``message`` object of the CrossRef
    ``works`` response. This is a best-effort call: a non-200 response or
    any exception, including a request timeout or a response without
    ``message``, is reported on stdout and yields an empty list.

    Args:
        doi (str): DOI of the paper.

    Returns:
        list[dict[str, Any]]: Reference metadata as produced by
        ``self._parse_crossref_references``, or ``[]`` on failure.
    """
    url = f"https://api.crossref.org/works/{doi}"
    print(f"[ReferenceRetriever] Requesting CrossRef references from URL: {url}")
    try:
        # Without a timeout requests waits indefinitely on a stalled server;
        # a timeout raises and is handled by the generic except below.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"[ReferenceRetriever] CrossRef request failed with status code: {response.status_code}")
            return []

        data = response.json()
        references = data['message'].get('reference', [])
        print(f"[ReferenceRetriever] CrossRef returned {len(references)} raw references.")
        return self._parse_crossref_references(references)
    except Exception as e:
        # Deliberately broad: this lookup must never crash the caller.
        print(f"[ReferenceRetriever] Error fetching references from CrossRef for DOI {doi}: {e}")
        return []

get_citing_articles_europe

get_citing_articles_europe(
    pmid: str,
) -> List[Dict[str, Any]]

Fetch citing articles from the Europe PMC API.

Parameters:

Name Type Description Default
pmid str

PubMed identifier.

required

Returns:

Type Description
List[Dict[str, Any]]

list[dict[str, Any]]: Citing article metadata.

Source code in pypaperretriever/reference_retriever.py
def get_citing_articles_europe(self, pmid: str) -> List[Dict[str, Any]]:
    """Fetch citing articles from the Europe PMC API.

    Requests the first page with ``pageSize=1000``, the same paging used by
    ``get_references_europe``, so the result is not cut off at the
    service's smaller default page size. Citations beyond that first page
    are not retrieved. This is a best-effort call: a non-200 response or
    any exception, including a request timeout, is reported on stdout and
    yields an empty list.

    Args:
        pmid (str): PubMed identifier.

    Returns:
        list[dict[str, Any]]: Citing article metadata as produced by
        ``self._parse_europe_cited_by``, or ``[]`` on failure.
    """
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/{pmid}/citations?page=1&pageSize=1000&format=json"
    print(f"[ReferenceRetriever] Requesting Europe PMC citing articles from URL: {url}")
    try:
        # Without a timeout requests waits indefinitely on a stalled server;
        # a timeout raises and is handled by the generic except below.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"[ReferenceRetriever] Europe PMC citing articles request failed with status code: {response.status_code}")
            return []

        data = response.json()
        citations = data.get('citationList', {}).get('citation', [])
        print(f"[ReferenceRetriever] Europe PMC returned {len(citations)} raw citing articles.")
        return self._parse_europe_cited_by(citations)
    except Exception as e:
        # Deliberately broad: this lookup must never crash the caller.
        print(f"[ReferenceRetriever] Error fetching citing articles from Europe PMC for PMID {pmid}: {e}")
        return []

get_citing_articles_pubmed

get_citing_articles_pubmed(
    pmid: str,
) -> List[Dict[str, Any]]

Fetch citing articles from PubMed via Entrez.

Parameters:

Name Type Description Default
pmid str

PubMed identifier.

required

Returns:

Type Description
List[Dict[str, Any]]

list[dict[str, Any]]: Citing article metadata.

Source code in pypaperretriever/reference_retriever.py
def get_citing_articles_pubmed(self, pmid: str) -> List[Dict[str, Any]]:
    """Fetch citing articles from PubMed via Entrez.

    Sets the module-level ``Entrez.email`` to ``self.email``, asks ELink for
    the ``pubmed_pubmed_citedin`` links of ``pmid``, and passes the linked
    PMIDs to ``self._fetch_articles_details``. This is a best-effort call:
    any exception is reported on stdout and yields an empty list.

    Args:
        pmid (str): PubMed identifier.

    Returns:
        list[dict[str, Any]]: Citing article metadata as returned by
        ``self._fetch_articles_details``, or ``[]`` when the request fails.
    """
    Entrez.email = self.email
    print(f"[ReferenceRetriever] Requesting PubMed citing articles for PMID: {pmid}")
    try:
        handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed_citedin")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails so it is not leaked.
            handle.close()

        citing_articles = []
        if 'LinkSetDb' in record[0]:
            # Only the first link set carrying the cited-in link name is used.
            for linksetdb in record[0]['LinkSetDb']:
                if linksetdb.get('LinkName') == 'pubmed_pubmed_citedin':
                    citing_articles = linksetdb.get('Link', [])
                    break

        pmids = [link['Id'] for link in citing_articles]
        print(f"[ReferenceRetriever] PubMed returned {len(pmids)} citing PMIDs.")
        return self._fetch_articles_details(pmids)
    except Exception as e:
        # Deliberately broad: this lookup must never crash the caller.
        print(f"[ReferenceRetriever] Error fetching citing articles from PubMed for PMID {pmid}: {e}")
        return []