`image_extractor`

Pull bitmap images out of PDFs.

pypaperretriever.image_extractor.ImageExtractor

ImageExtractor(pdf_file_path)

Extract figures from a PDF file.

The extractor handles both native PDFs (containing embedded images) and scanned PDFs where each page is an image.

Parameters:

Name	Type	Description	Default
`pdf_file_path`	`str`	Path to the PDF file to process.	required

Attributes:

Name	Type	Description
`filepath`	`str`	Path to the PDF file.
`dir`	`str`	Directory containing the PDF file.
`is_valid_pdf`	`bool`	Whether the file can be opened by PyMuPDF.
`is_native_pdf`	`bool`	`True` if the PDF contains embedded text.
`img_paths`	`list[str]`	Paths to extracted image files.
`img_counter`	`int`	Counter used to name extracted images.
`id`	`str | None`	Optional identifier prefix for saved images.

Initialize the extractor.

Parameters:

Name	Type	Description	Default
`pdf_file_path`	`str`	Path to the PDF file to process.	required

Source code in pypaperretriever/image_extractor.py

def __init__(self, pdf_file_path):
    """Set up extractor state for one PDF and probe the file.

    Args:
        pdf_file_path (str): Path to the PDF file to process.
    """
    # Where the source document lives.
    self.filepath = pdf_file_path
    self.dir = os.path.dirname(pdf_file_path)

    # Both flags start out False; the helpers called below are expected to
    # update them.
    self.is_valid_pdf = self.is_native_pdf = False

    # Bookkeeping for extracted images.
    self.img_paths = []
    self.img_counter = 0  # shared counter used to number saved images
    self.id = None

    # A corrupted PDF may fail to open, so validity is established before
    # anything else inspects the file.
    self._determine_if_valid_pdf()
    if not self.is_valid_pdf:
        return
    self._check_pdf_type()
    self._get_metadata()

extract_images

extract_images() -> Self

Extract images from the PDF.

For a native PDF, embedded images are extracted first and the pages are then also processed as rendered images; any other valid PDF goes through page-image processing only. Extracted image paths are stored in `img_paths`.

Returns:

Name	Type	Description
`Self`	`Self`	This instance with `img_paths` populated.

Source code in pypaperretriever/image_extractor.py

def extract_images(self) -> Self:
    """Extract images from the PDF.

    The method determines the PDF type and delegates to the appropriate
    extraction routine. Extracted image paths are stored in ``img_paths``.

    Returns:
        Self: This instance with ``img_paths`` populated.

    """
    if not self.is_valid_pdf:
        print("PDF is not valid.")
        return self
    if self.is_native_pdf:
        self.extract_from_native_pdf()
        self.handle_image_based_pdf()
    else:
        self.handle_image_based_pdf()
    return self

extract_from_native_pdf

extract_from_native_pdf() -> None

Extract figures from a native PDF using PyMuPDF.

Saves each valid image to disk and records its file path.

Source code in pypaperretriever/image_extractor.py

def extract_from_native_pdf(self) -> None:
    """Extract embedded images from a native PDF using PyMuPDF.

    Every image referenced by every page is decoded with Pillow and
    flattened to RGB; transparent pixels are composited onto a white
    background. The result is checked with ``_check_valid_img`` and, if
    accepted, written as a PNG into the ``images`` directory under
    ``self.dir``. For each saved file a JSON sidecar is requested via
    ``_make_json_sidecar``, the path is appended to ``img_paths`` and
    ``img_counter`` is advanced.

    An image that cannot be decoded or saved is reported on stdout and
    skipped so the remaining images are still processed. A failure to open
    or walk the document itself is reported and ends the extraction.
    Nothing is raised to the caller.
    """
    try:
        with pymupdf.open(self.filepath) as doc:
            for page_num, page in enumerate(doc):
                for img in page.get_images(full=True):
                    xref = img[0]
                    try:
                        image_bytes = doc.extract_image(xref)["image"]
                        image_pil = Image.open(io.BytesIO(image_bytes))

                        # Transparency must be flattened BEFORE the RGB
                        # conversion: once the image is RGB the alpha
                        # information is gone and cannot be composited.
                        has_alpha = image_pil.mode in ("RGBA", "LA") or (
                            image_pil.mode == "P" and "transparency" in image_pil.info
                        )
                        if has_alpha:
                            # Going through RGBA guarantees the last band is a
                            # real alpha channel (for "P" images it would
                            # otherwise be the palette index band).
                            rgba = image_pil.convert("RGBA")
                            background = Image.new("RGB", rgba.size, (255, 255, 255))
                            background.paste(rgba, mask=rgba.split()[-1])
                            image_pil = background
                        else:
                            # Gray, CMYK, palette, 1-bit, ... all end up as RGB.
                            image_pil = image_pil.convert("RGB")

                        # NOTE: no polarity inversion is applied to 1-bit
                        # images; they are saved exactly as Pillow decodes
                        # them.

                        # The validity check works on a NumPy array.
                        image_np = np.array(image_pil)
                        if not self._check_valid_img(image_np):
                            continue

                        id_prefix = f"id-{self.id}_" if self.id else ""
                        img_filepath = os.path.join(
                            self.dir, "images", f"{id_prefix}img-{self.img_counter}.png"
                        )
                        os.makedirs(os.path.dirname(img_filepath), exist_ok=True)

                        image_pil.save(img_filepath, "PNG")

                        self._make_json_sidecar(self.img_counter)
                        self.img_paths.append(img_filepath)
                        self.img_counter += 1
                    except Exception as e:
                        # Best effort: one bad image must not cost us the
                        # rest of the document.
                        print(f"Skipping image xref {xref} on page {page_num}: {e}")
    except Exception as e:
        print(f"Error extracting from native PDF: {e}")

handle_image_based_pdf

handle_image_based_pdf() -> None

Process a scanned PDF.

Each page is converted to an image and potential figures are extracted using `_crop_boxes_in_image`.

Source code in pypaperretriever/image_extractor.py

def handle_image_based_pdf(self) -> None:
    """Render each PDF page to an image and crop potential figures from it.

    Pages are rasterised with ``convert_from_path`` at 300 DPI. Each page is
    written to a temporary PNG in the ``images`` directory under
    ``self.dir`` and handed to :meth:`_crop_boxes_in_image`, whose return
    value becomes the new ``img_counter``. The temporary PNG is removed
    afterwards, including when saving or cropping fails.

    Any error is reported on stdout and ends processing of the remaining
    pages; nothing is raised to the caller.
    """
    try:
        pages = convert_from_path(self.filepath, 300)  # DPI set to 300 for good quality
        images_dir = os.path.join(self.dir, "images")

        for page_num, page in enumerate(pages):
            os.makedirs(images_dir, exist_ok=True)
            img_filepath = os.path.join(images_dir, f"page_{page_num}.png")
            try:
                page.save(img_filepath, 'PNG')
                self.img_counter = self._crop_boxes_in_image(img_filepath)
            finally:
                # The full-page render is only scratch data; never leave it
                # behind, even if saving or cropping raised.
                if os.path.exists(img_filepath):
                    os.remove(img_filepath)

    except Exception as e:
        print(f"Error handling image-based PDF: {e}")