Skip to content

pubmed_searcher

Search PubMed and run a batch pipeline (download, images, refs).

pypaperretriever.pubmed_searcher.PubMedSearcher

PubMedSearcher(search_string=None, df=None, email='')

Search PubMed and manage retrieved articles.

Parameters:

Name Type Description Default
search_string str | None

Query used for PubMed search.

None
df DataFrame | None

Existing table of articles.

None
email str

Email address required by Entrez.

''

Attributes:

Name Type Description
df DataFrame

Table of article metadata and processing flags.

search_string str | None

Stored search query.

email str

Email address used for API calls.

Initialize the searcher.

Parameters:

Name Type Description Default
search_string str | None

Query to submit to PubMed.

None
df DataFrame | None

Existing table of articles.

None
email str

Email address required by Entrez.

''
Source code in pypaperretriever/pubmed_searcher.py
def __init__(self, search_string=None, df=None, email=""):
    """Initialize the searcher.

    Args:
        search_string (str | None): Query to submit to PubMed.
        df (pandas.DataFrame | None): Existing table of articles.
        email (str): Email address required by Entrez.

    Raises:
        ValueError: If ``email`` is empty; Entrez requires a contact address.
    """
    if not email:
        # Fail fast: every Entrez call needs a contact email, so an
        # instance without one would be unusable anyway.
        raise ValueError("Email address is required for PubMed queries.")
    self.email = email
    self.search_string = search_string
    # Default to an empty table; the redundant re-assignment for the
    # search_string-only case in the original was dead code.
    self.df = df if df is not None else pd.DataFrame()
    if df is not None:
        # Ensure a caller-supplied table has the expected columns/flags.
        self._validate_dataframe(df)

search

search(
    count: int = 10,
    min_date: int | None = None,
    max_date: int | None = None,
    order_by: str = "chronological",
    only_open_access: bool = False,
    only_case_reports: bool = False,
) -> Self

Search PubMed for articles.

Parameters:

Name Type Description Default
count int

Number of articles to retrieve.

10
min_date int | None

Minimum publication year.

None
max_date int | None

Maximum publication year.

None
order_by str

"chronological" or "relevance".

'chronological'
only_open_access bool

If True, restrict to open-access articles.

False
only_case_reports bool

If True, restrict to case reports.

False

Returns:

Name Type Description
Self Self

This instance.

Source code in pypaperretriever/pubmed_searcher.py
def search(
    self,
    count: int = 10,
    min_date: int | None = None,
    max_date: int | None = None,
    order_by: str = "chronological",
    only_open_access: bool = False,
    only_case_reports: bool = False,
) -> Self:
    """Search PubMed for articles and append the results to ``df``.

    Args:
        count (int): Number of articles to retrieve.
        min_date (int | None): Minimum publication year.
        max_date (int | None): Maximum publication year.
        order_by (str): ``"chronological"`` or ``"relevance"``.
        only_open_access (bool): If ``True``, restrict to open-access articles.
        only_case_reports (bool): If ``True``, restrict to case reports.

    Returns:
        Self: This instance.

    Raises:
        ValueError: If no search string has been provided.
    """
    if not self.search_string:
        raise ValueError("Search string is not provided")

    additional_filters = []
    if only_open_access:
        # Combine open access filters with an OR condition
        additional_filters.append("(open access[filter] OR free full text[sb])")
    if only_case_reports:
        # Add filter for case reports
        additional_filters.append("case reports[pt]")

    # Build the final query locally instead of mutating ``self.search_string``:
    # the original appended filters to the stored query, so calling search()
    # more than once compounded the filter clauses on every call.
    query = self.search_string
    if additional_filters:
        query += " AND " + " AND ".join(additional_filters)

    Entrez.email = self.email
    search_params = {
        'db': "pubmed",
        'term': query,
        'retmax': count,
        'sort': 'relevance' if order_by == 'relevance' else 'pub date',
    }

    if min_date is not None:
        search_params['mindate'] = str(min_date)
    if max_date is not None:
        search_params['maxdate'] = str(max_date)

    search_handle = Entrez.esearch(**search_params)
    search_results = Entrez.read(search_handle)
    search_handle.close()

    id_list = search_results['IdList']
    fetch_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="xml")
    records_xml_bytes = fetch_handle.read()
    fetch_handle.close()

    records_df = self._parse_records_to_df(records_xml_bytes)
    self.df = pd.concat([self.df, records_df], ignore_index=True)
    return self

download_articles

download_articles(
    allow_scihub: bool = False,
    download_directory: str = "pdf_downloads",
    max_articles: int | None = None,
) -> Self

Download full-text PDFs for articles in df.

Parameters:

Name Type Description Default
allow_scihub bool

Use Sci-Hub as a fallback source.

False
download_directory str

Directory to store downloaded PDFs.

'pdf_downloads'
max_articles int | None

Maximum number of articles to process.

None

Returns:

Name Type Description
Self Self

The updated instance.

Source code in pypaperretriever/pubmed_searcher.py
def download_articles(
    self,
    allow_scihub: bool = False,
    download_directory: str = "pdf_downloads",
    max_articles: int | None = None,
) -> Self:
    """Download full-text PDFs for articles in ``df``.

    Args:
        allow_scihub (bool): Use Sci-Hub as a fallback source.
        download_directory (str): Directory to store downloaded PDFs.
        max_articles (int | None): Maximum number of articles to process.

    Returns:
        Self: The updated instance.

    """
    if self.df.empty:
        print("DataFrame is empty.")
        return self
    if 'pmid' not in self.df.columns or 'doi' not in self.df.columns:
        print("DataFrame is missing required columns for article download (pmid and doi)")
        return self

    # Lazily create the bookkeeping columns on first use.
    if 'download_complete' not in self.df.columns:
        self.df['download_complete'] = 'not_started'
    if 'pdf_filepath' not in self.df.columns:
        self.df['pdf_filepath'] = None

    processed = 0
    for idx, record in tqdm(self.df.iterrows(), total=self.df.shape[0], desc="Downloading articles"):
        if record.get('download_complete') in ('complete', 'unavailable'):
            continue
        if max_articles and processed >= max_articles:
            break

        doi = record.get('doi')
        if not doi or len(str(doi)) < 5:
            # Without a plausible DOI there is nothing we can retrieve.
            self.df.at[idx, 'download_complete'] = 'unavailable'
            continue

        retriever = PaperRetriever(
            pmid=record.get('pmid'),
            doi=doi,
            email=self.email,
            allow_scihub=allow_scihub,
            download_directory=download_directory,
        )
        filepath = retriever.download().filepath

        if filepath in [None, '', 'unavailable']:
            self.df.at[idx, 'download_complete'] = 'unavailable'
            self.df.at[idx, 'pdf_filepath'] = None
        else:
            self.df.at[idx, 'download_complete'] = 'complete'
            self.df.at[idx, 'pdf_filepath'] = filepath

        processed += 1
        # Persist progress so an interrupted run can resume.
        self.save()
    return self

extract_images

extract_images() -> Self

Extract images from downloaded PDFs using `ImageExtractor`.

Only rows marked as successfully downloaded are processed.

Returns:

Name Type Description
Self Self

The updated instance.

Source code in pypaperretriever/pubmed_searcher.py
def extract_images(self) -> Self:
    """Extract images from downloaded PDFs using :class:`ImageExtractor`.

    Only rows marked as successfully downloaded are processed.

    Returns:
        Self: The updated instance.

    """
    if self.df.empty:
        print("DataFrame is empty. No articles to extract images from.")
        return self

    for col in ('download_complete', 'pdf_filepath'):
        if col not in self.df.columns:
            print(f"Error: DataFrame is missing required column '{col}'.")
            return self

    # Bootstrap the result columns the first time through.
    if 'image_paths' not in self.df.columns:
        self.df['image_paths'] = [[] for _ in range(len(self.df))]
    if 'image_extraction_complete' not in self.df.columns:
        self.df['image_extraction_complete'] = 'not_started'

    for idx, record in tqdm(self.df.iterrows(), total=self.df.shape[0], desc="Extracting Images"):
        # Only process rows whose PDF download finished, and skip rows
        # whose extraction already ran.
        if record.get('download_complete') != 'complete':
            continue
        if record.get('image_extraction_complete') == 'complete':
            continue

        pdf_path = record.get('pdf_filepath')
        if not pdf_path or not os.path.isfile(pdf_path):
            self.df.at[idx, 'image_extraction_complete'] = 'pdf_not_found'
            continue

        try:
            extractor = ImageExtractor(pdf_file_path=pdf_path)
            extractor.extract_images()
            found = extractor.img_paths
            self.df.at[idx, 'image_paths'] = found
            self.df.at[idx, 'image_extraction_complete'] = 'complete' if found else 'no_images_found'
        except Exception as e:
            # Record the failure but keep going with the remaining rows.
            print(f"Error extracting images for PMID {record.get('pmid', 'Unknown')}: {e}")
            self.df.at[idx, 'image_extraction_complete'] = 'failed'

        # Persist progress after every row.
        self.save()

    print("Image extraction process completed.")
    return self

fetch_references

fetch_references() -> Self

Fetch references for each article in df.

Returns:

Name Type Description
Self Self

The updated instance.

Source code in pypaperretriever/pubmed_searcher.py
def fetch_references(self) -> Self:
    """Fetch references for each article in ``df``.

    Returns:
        Self: The updated instance.

    """
    if self.df.empty:
        print("DataFrame is empty. No articles to fetch references for.")
        return self

    if 'references' not in self.df.columns:
        self.df['references'] = None

    for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0], desc="Fetching References"):
        existing = row['references']
        # Skip rows that are already populated. Note: ``pd.notna`` cannot be
        # used directly here — on a list value (the normal success result) it
        # returns an element-wise array whose truth value is ambiguous.
        if existing is not None and not (isinstance(existing, float) and pd.isna(existing)):
            continue

        # Initialize ReferenceRetriever with available identifiers
        retriever = ReferenceRetriever(email=self.email, doi=row.get('doi'), pmid=row.get('pmid'), standardize=True)
        references = retriever.fetch_references()

        # Store references or mark as "Not found" if empty
        self.df.at[index, 'references'] = references if references else "Not found"

        # Save progress after each update
        self.save()

    return self

fetch_cited_by

fetch_cited_by() -> Self

Fetch citing articles for each entry in df.

Returns:

Name Type Description
Self Self

The updated instance.

Source code in pypaperretriever/pubmed_searcher.py
def fetch_cited_by(self) -> Self:
    """Fetch citing articles for each entry in ``df``.

    Returns:
        Self: The updated instance.

    """
    if self.df.empty:
        print("DataFrame is empty. No articles to fetch cited-by data for.")
        return self

    if 'cited_by' not in self.df.columns:
        self.df['cited_by'] = None

    for index, row in tqdm(self.df.iterrows(), total=self.df.shape[0], desc="Fetching Cited By"):
        existing = row['cited_by']
        # Skip rows that are already populated. ``pd.notna`` cannot be used
        # directly — on a list value (the normal success result) it returns an
        # element-wise array whose truth value is ambiguous.
        if existing is not None and not (isinstance(existing, float) and pd.isna(existing)):
            continue

        # Initialize ReferenceRetriever with available identifiers
        retriever = ReferenceRetriever(email=self.email, doi=row.get('doi'), pmid=row.get('pmid'))
        cited_by = retriever.fetch_cited_by()  # Now uses both Europe PMC & PubMed

        # Store citing articles or mark as "Not found" if empty
        self.df.at[index, 'cited_by'] = cited_by if cited_by else "Not found"

        # Save progress after each update
        self.save()

    return self

fetch_abstracts

fetch_abstracts()

Retrieve abstracts for articles missing them in df.

Source code in pypaperretriever/pubmed_searcher.py
def fetch_abstracts(self):
    """Retrieve abstracts for articles missing them in ``df``."""
    if not hasattr(self, 'df') or self.df.empty:
        print("DataFrame does not exist or is empty.")
        return

    if 'abstract' not in self.df.columns:
        self.df['abstract'] = None

    for idx, record in tqdm(self.df.iterrows(), total=self.df.shape[0], desc="Fetching Abstracts"):
        # Only fetch for rows that do not already have an abstract.
        if pd.notna(record['abstract']):
            continue
        pmid = record.get('pmid')
        if pd.notna(pmid):
            self.df.at[idx, 'abstract'] = self.get_abstract(pmid)

        # Persist progress after every row.
        self.save()

get_abstract

get_abstract(pmid: str) -> str

Fetch the abstract for a PMID.

Parameters:

Name Type Description Default
pmid str

Identifier of the article.

required

Returns:

Name Type Description
str str

Abstract text.

Source code in pypaperretriever/pubmed_searcher.py
def get_abstract(self, pmid: str) -> str:
    """Fetch the abstract for a PMID.

    Args:
        pmid (str): Identifier of the article.

    Returns:
        str: Abstract text (empty string when the record has no abstract).
    """
    Entrez.email = self.email
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    details = Entrez.read(handle)
    handle.close()
    article = details['PubmedArticle'][0]['MedlineCitation']['Article']
    # ``AbstractText`` is a sequence of sections; join them into one string.
    sections = article.get('Abstract', {}).get('AbstractText', '')
    return " ".join(sections)

download_xml_fulltext

download_xml_fulltext(
    download_directory: str = "downloads",
) -> Self

Download XML full text for open-access articles.

Parameters:

Name Type Description Default
download_directory str

Destination directory for XML files.

'downloads'

Returns:

Name Type Description
Self Self

The updated instance.

Source code in pypaperretriever/pubmed_searcher.py
def download_xml_fulltext(self, download_directory: str = "downloads") -> Self:
    """Download XML full text for open-access articles.

    Args:
        download_directory (str): Destination directory for XML files.

    Returns:
        Self: The updated instance.

    """
    if not hasattr(self, 'df') or self.df.empty:
        print("DataFrame does not exist or is empty.")
        return self

    # Bootstrap the bookkeeping columns on first use.
    if 'xml_download_complete' not in self.df.columns:
        self.df['xml_download_complete'] = 'Not started'
    if 'xml_filepath' not in self.df.columns:
        self.df['xml_filepath'] = None

    for idx, record in tqdm(self.df.iterrows(), total=self.df.shape[0], desc="Downloading XML full texts"):
        if record.get('xml_download_complete') == 'Complete':
            continue

        target_dir = self._determine_download_directory(record, download_directory, idx)
        os.makedirs(target_dir, exist_ok=True)

        # Build a "<LastName>_<Year>.xml" filename suffix when both parts exist.
        last_name = record.get('first_author', '').split(',')[0].strip() if 'first_author' in record else None
        year = str(record.get('publication_year')) if 'publication_year' in record else None
        suffix = f"{last_name}_{year}.xml" if last_name and year else None

        if not record.get('is_oa', False):
            self.df.at[idx, 'xml_download_complete'] = "Not OA or no XML available"
            self.df.at[idx, 'xml_filepath'] = None
            continue

        # Prefer Europe PMC; fall back to the PubMed OA subset via PMCID.
        xml_path = None
        if pd.notna(record.get('europe_pmc_url')):
            xml_path = self.download_article_xml_europe(record.get('pmid'), target_dir, suffix)
        elif pd.notna(record.get('pmcid')):
            xml_path = self.download_article_xml_pubmed_oa_subset(record.get('pmcid'), target_dir, suffix)

        if xml_path:
            self.df.at[idx, 'xml_download_complete'] = 'Complete'
            self.df.at[idx, 'xml_filepath'] = xml_path
        else:
            self.df.at[idx, 'xml_download_complete'] = "Unavailable"
            self.df.at[idx, 'xml_filepath'] = None

    return self

save

save(csv_path: str = 'master_list.csv') -> Self

Persist the internal DataFrame to CSV.

Parameters:

Name Type Description Default
csv_path str

Output path for the CSV file.

'master_list.csv'

Returns:

Name Type Description
Self Self

This instance.

Source code in pypaperretriever/pubmed_searcher.py
def save(self, csv_path: str = "master_list.csv") -> Self:
    """Persist the internal DataFrame to CSV.

    Args:
        csv_path (str): Output path for the CSV file.

    Returns:
        Self: This instance.

    """
    self.df.to_csv(csv_path, index=False)
    return self

save_abstracts_as_csv

save_abstracts_as_csv(
    filename: str = "abstracts.csv",
) -> Self

Save only PMIDs and abstracts to a CSV file.

Parameters:

Name Type Description Default
filename str

Output filename.

'abstracts.csv'

Returns:

Name Type Description
Self Self

This instance.

Source code in pypaperretriever/pubmed_searcher.py
def save_abstracts_as_csv(self, filename: str = "abstracts.csv") -> Self:
    """Save only PMIDs and abstracts to a CSV file.

    Args:
        filename (str): Output filename.

    Returns:
        Self: This instance.

    """
    abstracts_df = self.df[['pmid', 'abstract']].copy()
    abstracts_df.to_csv(filename, index=False)
    return self