Member-only story

PDF Manipulation using Python — fitz Library

Python Coding
2 min read · Aug 31, 2024

--

pip install PyMuPDF

1. Extract Text from a PDF

import fitz

def extract_text(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        One string holding the result of ``page.get_text()`` for each page,
        in page order. An empty string if the document has no pages.
    """
    # Using the document as a context manager guarantees it is closed even
    # when reading a page raises.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)

# Example usage: read every page of the sample PDF and print the result.
pdf_path = "clcoding.pdf"
text = extract_text(pdf_path)
# Write the extracted text to stdout.
print(text)
Output: Hello World!

2. Extract Images from a PDF

import fitz
import os

def extract_images(pdf_path, output_dir):
    """Write every image referenced by the pages of a PDF into a directory.

    Each image is saved as ``image_<page>_<xref>.<ext>``, where ``<page>`` is
    the 1-based page number, ``<xref>`` is the first element of the entry
    returned by ``page.get_images(full=True)``, and ``<ext>`` is the ``"ext"``
    value reported by ``doc.extract_image``.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.
        output_dir: Directory that receives the image files. It is created
            if it does not exist yet.

    Returns:
        None. The function is used for its side effect of writing files.
    """
    # Creating the directory here keeps the function usable on its own;
    # exist_ok=True makes this a no-op when the caller already created it.
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the document even if extraction or a file
    # write fails part-way through.
    with fitz.open(pdf_path) as doc:
        # start=1 yields the same 1-based page numbers that were previously
        # computed as ``page_num + 1``.
        for page_number, page in enumerate(doc, start=1):
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = os.path.join(
                    output_dir,
                    f"image_{page_number}_{xref}.{image_ext}",
                )
                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)

# Example usage: save the images of the sample PDF into ./images.
pdf_path = "clcoding.pdf"
output_dir = "images"
# Make sure the target directory exists before any file is written to it.
os.makedirs(output_dir, exist_ok=True)
extract_images(pdf_path, output_dir)

--

--

Python Coding
Python Coding

Written by Python Coding

Learn Python tips and tricks with code | Share your knowledge with us to help society. Python Quiz: https://www.clcoding.com/p/quiz-questions.html

No responses yet