Extract figures from a PDF file.
The extractor handles both native PDFs (containing embedded images) and
scanned PDFs where each page is an image.
Parameters:
| Name |
Type |
Description |
Default |
pdf_file_path
|
str
|
Path to the PDF file to process.
|
required
|
Attributes:
| Name |
Type |
Description |
filepath |
str
|
Path to the PDF file being processed.
|
dir |
str
|
Directory containing the PDF file.
|
is_valid_pdf |
bool
|
Whether the file can be opened by PyMuPDF.
|
is_native_pdf |
bool
|
True if the PDF contains embedded text.
|
img_paths |
list[str]
|
Paths to extracted image files.
|
img_counter |
int
|
Counter used to name extracted images.
|
id |
str | None
|
Optional identifier prefix for saved images.
|
Initialize the extractor.
Parameters:
| Name |
Type |
Description |
Default |
pdf_file_path
|
str
|
Path to the PDF file to process.
|
required
|
Source code in pypaperretriever/image_extractor.py
def __init__(self, pdf_file_path: str) -> None:
    """Set up the extraction state and probe the PDF.

    Args:
        pdf_file_path (str): Path to the PDF file to process.
    """
    # Where the PDF lives; the extraction methods write beneath ``dir``.
    self.filepath = pdf_file_path
    self.dir = os.path.dirname(pdf_file_path)

    # Both flags start out False until the probing helpers below have run.
    self.is_valid_pdf = False
    self.is_native_pdf = False

    # Bookkeeping for extracted images.
    self.img_paths: list[str] = []
    self.img_counter = 0  # Shared counter used to name extracted images
    self.id = None

    # Corrupted PDFs cannot always be opened, so validity is checked first
    # and the remaining probes are skipped for an unusable file.
    self._determine_if_valid_pdf()
    if not self.is_valid_pdf:
        return
    self._check_pdf_type()
    self._get_metadata()
|
Extract images from the PDF.
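The method relies on the PDF type detected when the extractor was constructed.
Native PDFs go through embedded-image extraction followed by the page-rendering
pass; scanned PDFs go through the page-rendering pass only. If the PDF is not
valid, nothing is extracted. Extracted image paths are stored in img_paths.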
Returns:
| Name | Type |
Description |
Self |
Self
|
This instance with img_paths populated.
|
Source code in pypaperretriever/image_extractor.py
| def extract_images(self) -> Self:
"""Extract images from the PDF.
The method determines the PDF type and delegates to the appropriate
extraction routine. Extracted image paths are stored in ``img_paths``.
Returns:
Self: This instance with ``img_paths`` populated.
"""
if not self.is_valid_pdf:
print("PDF is not valid.")
return self
if self.is_native_pdf:
self.extract_from_native_pdf()
self.handle_image_based_pdf()
else:
self.handle_image_based_pdf()
return self
|
extract_from_native_pdf() -> None
Extract figures from a native PDF using PyMuPDF.
Saves each valid image to disk and records its file path.
Source code in pypaperretriever/image_extractor.py
def extract_from_native_pdf(self) -> None:
    """Extract embedded raster images from a native PDF using PyMuPDF.

    Every image referenced by every page is decoded with PIL, flattened
    onto a white background when it carries transparency, and normalised
    to RGB. Images accepted by ``_check_valid_img`` are saved as
    ``<pdf dir>/images/[id-<id>_]img-<n>.png``; the path is appended to
    ``img_paths``, a JSON sidecar is requested via ``_make_json_sidecar``
    and ``img_counter`` is advanced.

    Any exception stops the extraction and is reported on stdout instead
    of being raised; images saved before the failure stay recorded.
    """
    try:
        with pymupdf.open(self.filepath) as doc:
            for page_num, page in enumerate(doc):
                for img in page.get_images(full=True):
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    # PyMuPDF reports the colour-space name under
                    # "cs-name"; the previously used "cs" key is kept as
                    # a fallback.
                    color_space = base_image.get("cs-name") or base_image.get("cs")
                    bpc = base_image.get("bpc", 8)  # Bits per component, default to 8

                    image_pil = Image.open(io.BytesIO(image_bytes))

                    # Transparency has to be resolved BEFORE the RGB
                    # conversion: convert("RGB") discards the alpha
                    # channel, after which compositing is impossible.
                    has_alpha = image_pil.mode in ("RGBA", "LA") or (
                        image_pil.mode == "P" and 'transparency' in image_pil.info
                    )
                    if has_alpha:
                        # Going through RGBA gives palette images a real
                        # alpha band to use as the paste mask.
                        rgba = image_pil.convert("RGBA")
                        background = Image.new("RGB", rgba.size, (255, 255, 255))
                        background.paste(rgba, mask=rgba.split()[-1])
                        image_pil = background
                    elif color_space == "DeviceCMYK":
                        image_pil = image_pil.convert("CMYK").convert("RGB")
                    elif color_space == "DeviceGray":
                        image_pil = image_pil.convert("L").convert("RGB")
                    else:
                        # DeviceRGB and any other or unknown colour space
                        image_pil = image_pil.convert("RGB")

                    # NOTE(review): the image is always RGB here, so this
                    # 1-bit inversion has never been able to run. It is
                    # left unreachable so existing output does not
                    # change; confirm that 1-bit images really need
                    # inverting before testing the mode ahead of the RGB
                    # conversion.
                    if bpc == 1 and image_pil.mode == 'L':
                        print(f"Inverting image {self.img_counter} on page {page_num} due to bpc=1 and grayscale mode.")
                        image_pil = ImageOps.invert(image_pil)
                        image_pil = image_pil.convert("RGB")

                    # The validity check works on a NumPy array.
                    image_np = np.array(image_pil)
                    if not self._check_valid_img(image_np):
                        continue

                    id_prefix = f"id-{self.id}_" if self.id else ""
                    img_filepath = os.path.join(self.dir, "images", f"{id_prefix}img-{self.img_counter}.png")
                    # Created lazily so no empty "images" directory is
                    # left behind when nothing passes validation.
                    os.makedirs(os.path.dirname(img_filepath), exist_ok=True)
                    image_pil.save(img_filepath, "PNG")
                    self._make_json_sidecar(self.img_counter)
                    self.img_paths.append(img_filepath)
                    self.img_counter += 1
    except Exception as e:
        print(f"Error extracting from native PDF: {e}")
|
handle_image_based_pdf
handle_image_based_pdf() -> None
Process a scanned PDF.
Each page is converted to an image and potential figures are extracted
using the `_crop_boxes_in_image` method.
Source code in pypaperretriever/image_extractor.py
def handle_image_based_pdf(self) -> None:
    """Render each page to an image and crop candidate figures from it.

    Every page is rasterised at 300 DPI, written to a temporary
    ``images/page_<n>.png`` next to the PDF and passed to
    :meth:`_crop_boxes_in_image`, whose return value becomes the new
    ``img_counter``. The temporary page image is deleted afterwards, even
    when processing that page fails.

    Any exception stops the processing and is reported on stdout instead
    of being raised.
    """
    try:
        pages = convert_from_path(self.filepath, 300)  # DPI set to 300 for good quality
        images_dir = os.path.join(self.dir, "images")
        for page_num, page in enumerate(pages):
            img_filepath = os.path.join(images_dir, f"page_{page_num}.png")
            os.makedirs(images_dir, exist_ok=True)
            try:
                page.save(img_filepath, 'PNG')
                self.img_counter = self._crop_boxes_in_image(img_filepath)
            finally:
                # The full-page render is only scratch input for the
                # cropper; never leave it behind, not even on failure.
                if os.path.exists(img_filepath):
                    os.remove(img_filepath)
    except Exception as e:
        print(f"Error handling image-based PDF: {e}")
|