paper_retriever

Download a single paper by DOI/PMID.

pypaperretriever.paper_retriever.PaperRetriever

PaperRetriever(
    email,
    doi=None,
    pmid=None,
    allow_scihub=False,
    download_directory="PDFs",
    filename=None,
    override_previous_attempt=False,
)

Find and download scientific papers.

The class queries several services (Unpaywall, PubMed Central, Crossref and optionally Sci-Hub) to locate a PDF for a given DOI or PMID.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| email | str | Email address used for API requests. | required |
| doi | str | Digital Object Identifier of the paper. | None |
| pmid | str | PubMed identifier of the paper. | None |
| allow_scihub | bool | Whether to query Sci-Hub as a fallback. | False |
| download_directory | str | Directory where PDFs are stored. | 'PDFs' |
| filename | str | Custom filename for the downloaded PDF. | None |
| override_previous_attempt | bool | Overwrite existing downloads. | False |

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| doi | str | DOI encoded for safe file paths. |
| pmid | str \| None | PubMed ID of the paper. |
| pdf_urls | list[str] | Candidate URLs pointing to PDF files. |
| filepath | str \| None | Path to the downloaded PDF if successful. |
| is_downloaded | bool | True if the PDF has been retrieved. |
| is_oa | bool | True if the paper is open access. |
| on_scihub | bool | True if the PDF was found on Sci-Hub. |

Source code in pypaperretriever/paper_retriever.py
def __init__(self, email, doi=None, pmid=None, allow_scihub=False, download_directory='PDFs', filename=None, override_previous_attempt=False):
    self.email = email
    if not doi and not pmid:
        raise ValueError("Either a DOI or PMID must be provided")
    if not doi and pmid:
        doi = pmid_to_doi(pmid, email)
    self.doi = encode_doi(doi)
    self.pmid = pmid
    self.allow_scihub = allow_scihub
    self.is_oa = False
    self.on_scihub = False
    self.pdf_urls = []
    self.is_downloaded = False
    self.filepath = None
    self.override_previous_attempt = override_previous_attempt
    self.download_directory = download_directory
    self.filename = filename
    self.user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Mobile Safari/537.36",
    ]
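
A minimal construction sketch, using placeholder values for the contact email and DOI (either a DOI or a PMID must be supplied; a bare PMID is converted to a DOI internally):

from pypaperretriever.paper_retriever import PaperRetriever

# Placeholder identifiers for illustration only.
retriever = PaperRetriever(
    email="you@example.org",
    doi="10.1000/example-doi",
    download_directory="PDFs",
    allow_scihub=False,
)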

download

download() -> Self

Find and download the paper.

The method queries multiple services for PDF links and attempts to download the first accessible file. Metadata about the attempt is stored alongside the PDF.

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Self | Self | This instance. |

Source code in pypaperretriever/paper_retriever.py
def download(self) -> Self:
    """Find and download the paper.

    The method queries multiple services for PDF links and attempts to
    download the first accessible file. Metadata about the attempt is stored
    alongside the PDF.

    Returns:
        Self: This instance.

    """
    if not self.override_previous_attempt:
        self._look_for_previous_download()
        if self.is_downloaded:
            return self

    self.check_open_access()
    self.check_pubmed_central_access()
    self.check_crossref_access(decode_doi(self.doi))
    if len(self.pdf_urls) > 0:
        print("[PyPaperRetriever] Found Open-Access PDF link(s). Attempting download...")
        if self._download_pdf():
            return self
    if self.allow_scihub:
        self.pdf_urls = []
        self.check_scihub_access()
        if len(self.pdf_urls) > 0:
            print("[PyPaperRetriever] Found PDF on Sci-Hub. Attempting download...")
            self._download_pdf()
            if self.is_downloaded:
                return self
        else:
            print(f"[PyPaperRetriever] No PDFs found for {decode_doi(self.doi)}")

    else:
        print(f"[PyPaperRetriever] No Open-Access PDF found for {decode_doi(self.doi)}. Sci-Hub access is disabled.")
    self._download_pdf()  # Just to create JSON sidecar
    return self
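
Putting it together, a hedged usage sketch of the flow above; the identifiers are placeholders, and the attributes inspected afterwards are those documented for the class:

from pypaperretriever.paper_retriever import PaperRetriever

# Placeholder email and DOI; substitute real values.
result = PaperRetriever(email="you@example.org", doi="10.1000/example-doi").download()

if result.is_downloaded:
    print(f"Saved PDF to {result.filepath} (open access: {result.is_oa})")
else:
    print("No accessible PDF could be retrieved.")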

check_open_access

check_open_access() -> Self

Check Unpaywall for open-access availability.

Updates pdf_urls with any links returned by the API and sets is_oa if open-access links are found.

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Self | Self | This instance. |

Source code in pypaperretriever/paper_retriever.py
def check_open_access(self) -> Self:
    """Check Unpaywall for open-access availability.

    Updates ``pdf_urls`` with any links returned by the API and sets
    ``is_oa`` if open-access links are found.

    Returns:
        Self: This instance.

    """

    url = f"https://api.unpaywall.org/v2/{decode_doi(self.doi)}?email={self.email}"
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        pdf_urls = [None, None, None, None]
        pdf_locations = [loc.get("url_for_pdf") for loc in data.get("oa_locations", []) if loc.get("url_for_pdf")]
        pdf_urls[:len(pdf_locations)] = pdf_locations[:4]
        pdf_urls = [url for url in pdf_urls if url]

        pubmed_europe_info = next((
            (loc.get("url").split("?")[0], loc.get("url").split("pmc")[-1].split("/")[0])
            for loc in data.get("oa_locations", [])
            if "europepmc.org/articles/pmc" in loc.get("url", "")
        ), (None, None))
        pubmed_europe_url, pmcid = pubmed_europe_info # Not used in current implementation

        if len(pdf_urls) > 0:
            self.is_oa = True
            self.pdf_urls += pdf_urls
        return self

    else:
        print("error", f"Unpaywall API request failed with status code {response.status_code}")
        return self
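
For orientation, the Unpaywall request issued above can be reproduced on its own. This sketch uses a placeholder email and DOI and reads only the oa_locations / url_for_pdf fields that the method consumes:

import requests

doi = "10.1000/example-doi"   # placeholder DOI
email = "you@example.org"     # placeholder contact email

resp = requests.get(f"https://api.unpaywall.org/v2/{doi}?email={email}")
if resp.status_code == 200:
    locations = resp.json().get("oa_locations", [])
    pdf_links = [loc["url_for_pdf"] for loc in locations if loc.get("url_for_pdf")]
    print(pdf_links)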

check_pubmed_central_access

check_pubmed_central_access() -> Self

Check whether the article is available in PubMed Central.

Any discovered PDF links are appended to pdf_urls.

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Self | Self | This instance. |

Source code in pypaperretriever/paper_retriever.py
def check_pubmed_central_access(self) -> Self:
    """Check whether the article is available in PubMed Central.

    Any discovered PDF links are appended to ``pdf_urls``.

    Returns:
        Self: This instance.

    """
    pmc_id = None
    id = self.pmid if self.pmid else doi_to_pmid(decode_doi(self.doi), self.email)
    records = entrez_efetch(self.email, id)
    try:
        id_list = records['PubmedArticle'][0]['PubmedData']['ArticleIdList']
        for element in id_list:
            if element.attributes.get('IdType') == 'pmc':
                pmc_id = str(element)

    except Exception as e:
        print(f"Error processing while checking PMC access for id {id}: {e}")

    if pmc_id is not None:

        article_link = f'https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}/'

        response = requests.get(article_link, headers={"User-Agent": random.choice(self.user_agents)})

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Find PDF links
            pdf_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.pdf')]
            pdf_links = [f"{article_link}{pdf_link}" if pdf_link.startswith('/') else pdf_link for pdf_link in pdf_links]
            pdf_links = list(set(pdf_links))
            for link in pdf_links:
                self.pdf_urls.append(link)
        else:
            print(f"Failed to fetch the PubMed Central link. Status code: {response.status_code}")

    return self
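
The PDF-link scrape at the end of this method can be sketched in isolation. The PMCID below is a placeholder; the selector simply keeps anchors whose href ends in .pdf, as the method does:

import requests
from bs4 import BeautifulSoup

pmc_id = "PMC0000000"  # placeholder PMCID
article_link = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}/"

resp = requests.get(article_link, headers={"User-Agent": "Mozilla/5.0"})
if resp.status_code == 200:
    soup = BeautifulSoup(resp.content, "html.parser")
    pdf_links = {a["href"] for a in soup.find_all("a", href=True) if a["href"].endswith(".pdf")}
    print(sorted(pdf_links))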

check_crossref_access

check_crossref_access(doi: str) -> Self

Query Crossref for PDF links.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doi | str | DOI to query. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Self | Self | This instance. |

Source code in pypaperretriever/paper_retriever.py
def check_crossref_access(self, doi: str) -> Self:
    """Query Crossref for PDF links.

    Args:
        doi (str): DOI to query.

    Returns:
        Self: This instance.

    """
    base_url = "https://api.crossref.org/works/"
    full_url = f"{base_url}{doi}"
    urls = []
    pdf_urls = []

    try:
        response = requests.get(full_url)
        if response.status_code == 200:
            data = response.json()
            primary_url = data.get('message', {}).get('URL', None)
            if primary_url:
                urls.append(primary_url)
            doi_link = f"https://doi.org/{doi}"
            urls.append(doi_link)

        for url in urls:
            try:
                response = requests.get(url, headers={
                    "User-Agent":  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                                "AppleWebKit/537.36 (KHTML, like Gecko) " +
                                "Chrome/58.0.3029.110 Safari/537.3"
                }, timeout=10)  # Added timeout for better error handling

                if response.status_code == 200:
                    final_url = response.url  # The final resolved URL after redirects
                    soup = BeautifulSoup(response.content, 'html.parser')

                    pdf_links = set()

                    # 1. Extract PDF links from <a> tags
                    for a in soup.find_all('a', href=True):
                        href = a['href']
                        if href.lower().endswith('.pdf'):
                            absolute_url = urljoin(final_url, href)
                            pdf_links.add(absolute_url)

                    # 2. Extract PDF links from JavaScript
                    for script in soup.find_all('script'):
                        if script.string:
                            # Regex to find patterns like window.open('/path/to/file.pdf') or href = "/path/to/file.pdf"
                            matches = re.findall(r'''(?:window\.open|href\s*=\s*)\(['"]([^'"]+\.pdf)['"]\)''', script.string, re.IGNORECASE)
                            for match in matches:
                                absolute_url = urljoin(final_url, match)
                                pdf_links.add(absolute_url)

                            # Another regex pattern based on the example provided
                            matches = re.findall(r'''location\s*=\s*['"]([^'"]+\.pdf)['"]''', script.string, re.IGNORECASE)
                            for match in matches:
                                absolute_url = urljoin(final_url, match)
                                pdf_links.add(absolute_url)

                    # 3. Optionally, search for direct links in data attributes or other patterns
                    # Example: data-pdf-url="/path/to/file.pdf"
                    data_pdf_urls = re.findall(r'data-pdf-url=["\']([^"\']+\.pdf)["\']', response.text, re.IGNORECASE)
                    for match in data_pdf_urls:
                        absolute_url = urljoin(final_url, match)
                        pdf_links.add(absolute_url)

                    # Remove any invalid URLs (optional)
                    valid_pdf_links = set()
                    for link in pdf_links:
                        parsed = urlparse(link)
                        if parsed.scheme in ['http', 'https']:
                            valid_pdf_links.add(link)

                    if valid_pdf_links:
                        for link in valid_pdf_links:
                            pdf_urls.append(link)

                else:
                    print(f"Failed to access URL: {url} with status code {response.status_code}")

            except requests.exceptions.RequestException as e:
                print(f"Error accessing URL: {url}")
                print(e)

        final_pdf_urls = list(set(pdf_urls))
        for link in final_pdf_urls:
            self.pdf_urls.append(link)

    except requests.exceptions.RequestException as e:
        print("Something went wrong while trying to access Crossref API")
        print(e)
    return self
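
A minimal sketch of the Crossref lookup alone, with a placeholder DOI; it reads the same message.URL field that the method uses as the landing page for the scrape above:

import requests

doi = "10.1000/example-doi"  # placeholder DOI
resp = requests.get(f"https://api.crossref.org/works/{doi}", timeout=10)
if resp.status_code == 200:
    landing_page = resp.json().get("message", {}).get("URL")
    print(landing_page, f"https://doi.org/{doi}")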

check_scihub_access

check_scihub_access() -> Self

Search Sci-Hub mirrors for the paper.

Introduces small delays and rotates user agents to reduce the likelihood of being blocked.

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Self | Self | This instance. |

Source code in pypaperretriever/paper_retriever.py
def check_scihub_access(self) -> Self:
    """Search Sci-Hub mirrors for the paper.

    Introduces small delays and rotates user agents to reduce the likelihood
    of being blocked.

    Returns:
        Self: This instance.

    """
    mirror_list = ["https://sci-hub.st", "https://sci-hub.ru", "https://sci-hub.se"]
    urls = [f"{mirror}/{decode_doi(self.doi)}" for mirror in mirror_list]

    for i, url in enumerate(urls):
        time.sleep(random.randint(1, 3)) # Delay between requests, avoids being blocked
        headers = {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/",
        }
        try:
            r = requests.get(url, headers=headers)
            if r.status_code == 200:
                if len(r.text) < 1:
                    print("""We probably got blocked by Sci-Hub for too many requests. 
                        For being a source of free scientific knowledge, they sure are stingy with their bandwidth.
                        Although they don't specify rate limit and have no robots.txt, they still block IPs with too many requests.
                        Try connecting to a different proxy IP with a VPN.""")
                    break
                result = self._get_pdf_element(r.text, mirror_list[i])
                if result == "unavailable":
                    continue
                elif result:
                    self.on_scihub = True
                    self.pdf_urls.append(result)
                    break
        except requests.RequestException as e:
            print(f"Failed to scrape {url} due to {e}")
            print("If this error includes 'Connection reset by peer', your ISP may be blocking Sci-Hub. Try using a VPN, like ProtonVPN.")
    return self
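
The anti-blocking pattern the method relies on, a randomized delay plus a rotated User-Agent per request, looks roughly like this sketch; the mirrors match the source, the DOI is a placeholder:

import random
import time

import requests

mirrors = ["https://sci-hub.st", "https://sci-hub.ru", "https://sci-hub.se"]
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
]
doi = "10.1000/example-doi"  # placeholder DOI

for mirror in mirrors:
    time.sleep(random.randint(1, 3))  # brief pause between mirrors
    headers = {"User-Agent": random.choice(user_agents)}
    r = requests.get(f"{mirror}/{doi}", headers=headers)
    if r.status_code == 200 and r.text:
        print(f"Response received from {mirror}")
        break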

CLI entry point

pypaperretriever.paper_retriever.main

main() -> None

Run the command-line interface.

The interface accepts DOI or PMID identifiers and downloads the corresponding PDFs using PaperRetriever.

Source code in pypaperretriever/paper_retriever.py
def main() -> None:
    """Run the command-line interface.

    The interface accepts DOI or PMID identifiers and downloads the
    corresponding PDFs using :class:`PaperRetriever`.
    """
    parser = argparse.ArgumentParser(description='Download scientific papers automatically.')
    parser.add_argument('--email', required=True, help='Email address for API usage.')
    parser.add_argument('--doi', help='Digital Object Identifier of the paper.')
    parser.add_argument('--pmid', help='PubMed ID of the paper.')
    parser.add_argument('--dwn-dir', default='PDFs', help='Directory to download the PDFs into. Defaults to "PDFs".')
    parser.add_argument('--filename', help='Custom filename for the downloaded PDF.')
    parser.add_argument('--override', action='store_true', help='Override previous download attempts.')
    parser.add_argument('--allow-scihub', choices=['true', 'false'], default='false',
                    help='Allow downloading from Sci-Hub if available (true/false).')

    args = parser.parse_args()
    args.allow_scihub = args.allow_scihub.lower() == 'true' 

    retriever = PaperRetriever(
        email=args.email,
        doi=args.doi,
        pmid=args.pmid,
        download_directory=args.dwn_dir,
        filename=args.filename,
        override_previous_attempt=args.override,
        allow_scihub=args.allow_scihub,
    )

    retriever.download()
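
An example invocation with placeholder values; the flags match those defined above, while the launcher shown (running the module with python -m) is an assumption about how the entry point is exposed:

python -m pypaperretriever.paper_retriever --email you@example.org --doi 10.1000/example-doi --dwn-dir PDFs --allow-scihub false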