Member-only story
PDF Manipulation using Python — fitz Library
2 min read · Aug 31, 2024
pip install PyMuPDF
1. Extract Text from a PDF
import fitz
def extract_text(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, joined in
        page order with no separator added between pages.
    """
    # The context manager closes the document even if reading a page raises;
    # previously the document was opened and never closed.
    with fitz.open(pdf_path) as doc:
        # join() avoids repeated string concatenation across pages.
        return "".join(page.get_text() for page in doc)
if __name__ == "__main__":
    # Demo run: print all text found in the sample PDF. Guarded so that
    # importing this module does not open files or print anything.
    pdf_path = "clcoding.pdf"
    text = extract_text(pdf_path)
    print(text)
Hello World!
2. Extract Images from a PDF
import fitz
import os
def extract_images(pdf_path, output_dir):
    """Write every image listed on the pages of a PDF into a directory.

    Each file is named ``image_<page>_<xref>.<ext>``, where ``<page>`` is the
    1-based page number, ``<xref>`` is the first item of the entry returned
    by ``page.get_images(full=True)``, and ``<ext>`` is the ``"ext"`` value
    reported by ``doc.extract_image``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
        output_dir: Directory that receives the image files. It is created
            if it does not exist yet.

    Returns:
        None. The images are written to disk as a side effect.
    """
    # Creating the directory here means the function works on its own
    # instead of relying on the caller to have prepared it.
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the document even if an extraction or a
    # file write fails; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # start=1 keeps the 1-based page number used in the file names.
        for page_num, page in enumerate(doc, start=1):
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_filename = os.path.join(
                    output_dir,
                    f"image_{page_num}_{xref}.{base_image['ext']}",
                )
                with open(image_filename, "wb") as image_file:
                    image_file.write(base_image["image"])
if __name__ == "__main__":
    # Demo run: dump the sample PDF's images into ./images. Guarded so that
    # importing this module does not create directories or write files.
    pdf_path = "clcoding.pdf"
    output_dir = "images"
    os.makedirs(output_dir, exist_ok=True)
    extract_images(pdf_path, output_dir)