Skip to content

paper_tracker

Trace a seed paper’s citation network.

pypaperretriever.paper_tracker.PaperTracker

PaperTracker(
    email: str,
    max_upstream_generations: int = 1,
    max_downstream_generations: int = 1,
    doi: Optional[str] = None,
    pmid: Optional[str] = None,
)

Track references and citations for a given paper.

Parameters:

Name Type Description Default
email str

Email address used for API requests.

required
max_upstream_generations int

Depth of reference traversal.

1
max_downstream_generations int

Depth of citation traversal.

1
doi str

DOI of the root paper.

None
pmid str

PMID of the root paper.

None

Attributes:

Name Type Description
df DataFrame

Table describing all tracked papers.

processed_upstream set[str]

Papers already expanded upstream.

processed_downstream set[str]

Papers already expanded downstream.

Initialize the PaperTracker with either a DOI or PMID.

Parameters:

Name Type Description Default
email str

Email for API authentication

required
max_upstream_generations int

Maximum reference depth to track. Defaults to 1.

1
max_downstream_generations int

Maximum citation depth to track. Defaults to 1.

1
doi str

Digital Object Identifier of the paper. Required if pmid not provided.

None
pmid str

PubMed ID of the paper. Required if doi not provided.

None

Raises:

Type Description
ValueError

If neither DOI nor PMID is provided

Source code in pypaperretriever/paper_tracker.py
def __init__(
    self,
    email: str,
    max_upstream_generations: int = 1,
    max_downstream_generations: int = 1,
    doi: Optional[str] = None,
    pmid: Optional[str] = None,
):
    """
    Set up a tracker rooted at a paper identified by DOI and/or PMID.

    The constructor stores the configuration, builds a ``ReferenceRetriever``
    for the root paper and prepares an empty results table plus the two
    bookkeeping sets of already-expanded papers.

    Args:
        email (str): Email for API authentication; also passed to the root retriever.
        max_upstream_generations (int, optional): Maximum reference depth to track. Defaults to 1.
        max_downstream_generations (int, optional): Maximum citation depth to track. Defaults to 1.
        doi (str, optional): Digital Object Identifier of the root paper. Needed when no pmid is given.
        pmid (str, optional): PubMed ID of the root paper. Needed when no doi is given.

    Raises:
        ValueError: If both identifiers are missing or empty.
    """
    # At least one truthy identifier is required to locate the root paper.
    if not (doi or pmid):
        raise ValueError("Either DOI or PMID must be provided.")

    # Plain configuration, stored as given.
    self.email = email
    self.max_upstream_generations = max_upstream_generations
    self.max_downstream_generations = max_downstream_generations
    self.doi, self.pmid = doi, pmid

    print(f"[PaperTracker] Initializing with DOI: {doi} and PMID: {pmid}")
    self.root_retriever = ReferenceRetriever(email=email, doi=doi, pmid=pmid)

    # Schema of the results table; starts with no rows.
    self.df_columns = [
        'doi',
        'pmid',
        'title',
        'authors',
        'year',
        'upstream_generation',
        'downstream_generation',
        'children_identifiers',
        'parent_identifiers',
    ]
    self.df = pd.DataFrame(columns=self.df_columns)

    # Papers already expanded in each direction.
    self.processed_upstream, self.processed_downstream = set(), set()

go_upstream

go_upstream(
    doi: Optional[str] = None, pmid: Optional[str] = None
) -> List[Dict[str, Any]]

Fetch references for a paper.

Parameters:

Name Type Description Default
doi str

DOI of the paper.

None
pmid str

PMID of the paper.

None

Returns:

Type Description
List[Dict[str, Any]]

Metadata for each reference of the paper.

Source code in pypaperretriever/paper_tracker.py
def go_upstream(self, doi: Optional[str] = None, pmid: Optional[str] = None) -> List[Dict[str, Any]]:
    """Look up the references of a single paper.

    A new ``ReferenceRetriever`` is created for the given identifiers using
    this tracker's email, and the result of its ``fetch_references()`` call
    is returned unchanged.

    Args:
        doi (str, optional): DOI of the paper.
        pmid (str, optional): PMID of the paper.

    Returns:
        list[dict[str, Any]]: Reference metadata.
    """
    print(f"[PaperTracker] Going upstream for DOI: {doi}, PMID: {pmid}")
    return ReferenceRetriever(
        email=self.email,
        doi=doi,
        pmid=pmid,
    ).fetch_references()

go_downstream

go_downstream(
    doi: Optional[str] = None, pmid: Optional[str] = None
) -> List[Dict[str, Any]]

Fetch papers that cite the given paper.

Parameters:

Name Type Description Default
doi str

DOI of the paper.

None
pmid str

PMID of the paper.

None

Returns:

Type Description
List[Dict[str, Any]]

Metadata for each paper that cites the given paper.

Source code in pypaperretriever/paper_tracker.py
def go_downstream(self, doi: Optional[str] = None, pmid: Optional[str] = None) -> List[Dict[str, Any]]:
    """Look up the papers that cite a single paper.

    A new ``ReferenceRetriever`` is created for the given identifiers using
    this tracker's email, and the result of its ``fetch_cited_by()`` call
    is returned unchanged.

    Args:
        doi (str, optional): DOI of the paper.
        pmid (str, optional): PMID of the paper.

    Returns:
        list[dict[str, Any]]: Citing paper metadata.
    """
    print(f"[PaperTracker] Going downstream for DOI: {doi}, PMID: {pmid}")
    return ReferenceRetriever(
        email=self.email,
        doi=doi,
        pmid=pmid,
    ).fetch_cited_by()

track_paper

track_paper() -> pd.DataFrame

Build the citation network around the root paper.

Returns:

Type Description
DataFrame

Table containing all tracked papers and their relationships.

Source code in pypaperretriever/paper_tracker.py
def track_paper(self) -> pd.DataFrame:
    """Run the full tracking pass starting from the root paper.

    The upstream traversal is started first, then the downstream one, each
    from the tracker's own ``doi``/``pmid``. Progress messages are printed
    before and after.

    Returns:
        pandas.DataFrame: The tracker's ``df`` table once both traversals have run.
    """
    print(f"[PaperTracker] Starting tracking process for DOI: {self.doi}, PMID: {self.pmid}")

    # Each traversal is started with 0 and None as its last two arguments
    # (presumably the starting generation and "no parent" -- confirm against
    # the helper definitions). Identifiers are re-read from self per call.
    self._track_upstream(
        self.doi, self.pmid, 0, None
    )
    self._track_downstream(
        self.doi, self.pmid, 0, None
    )

    print("[PaperTracker] Tracking process completed.")
    tracked = self.df
    return tracked