import os
import re
import sqlite3
import hashlib # 添加hashlib库以生成哈希值
from argparse import ArgumentParser
import fitz # PyMuPDF库
def extract_pages_from_pdf(pdf_path, output_dir):
    """Split a PDF into one single-page PDF file per page.

    Every page of ``pdf_path`` is written to ``output_dir`` as
    ``<source stem>_page<NNNN>.pdf``, where ``NNNN`` is the 1-based page
    number zero-padded to at least four digits.

    Args:
        pdf_path: Path of the PDF file to split.
        output_dir: Directory that receives the single-page files. It is
            created if it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    file_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # Context managers guarantee that both the source document and every
    # per-page writer are closed, even when saving a page fails.
    with fitz.open(pdf_path) as pdf:
        for page_num in range(pdf.page_count):
            output_pdf = f"{output_dir}/{file_name}_page{page_num + 1:04d}.pdf"
            # fitz.open() without arguments creates a new, empty document.
            with fitz.open() as writer:
                writer.insert_pdf(pdf, from_page=page_num, to_page=page_num)
                writer.save(output_pdf)
def process_pdf_to_sqlite(single_page_pdf_dir, db_filename):
    """Load every single-page PDF of a directory into a SQLite table.

    The table ``pdf_pages`` is dropped and recreated on every call. For each
    file in ``single_page_pdf_dir`` whose name ends with ``.pdf`` one row is
    inserted holding:

    * ``id``: SHA-256 hex digest of the file name,
    * ``file_name``: the file name,
    * ``page_number``: the number parsed from the ``_page<digits>.pdf``
      suffix of the file name,
    * ``page_content``: the text extracted from the first page,
    * ``page_binary``: the raw bytes of the file.

    A file that cannot be processed is reported on stdout and skipped; the
    remaining files are still imported.

    Args:
        single_page_pdf_dir: Directory containing the single-page PDF files.
        db_filename: Path of the SQLite database file to write to.
    """
    # Any run of digits is accepted so that page numbers without a leading
    # zero (page 1000 and above) are parsed too; the dot is escaped so it
    # only matches a literal ".".
    page_pattern = re.compile(r"_page([0-9]+)\.pdf$")

    conn = sqlite3.connect(db_filename)
    try:
        c = conn.cursor()
        # Recreate the table from scratch; ``id`` is a TEXT PRIMARY KEY that
        # stores the hash of the file name.
        c.execute('''-- 先检查并删除表(如果存在)
            DROP TABLE IF EXISTS pdf_pages;''')
        c.execute('''
            CREATE TABLE pdf_pages (
            id TEXT PRIMARY KEY,
            file_name TEXT,
            page_number INTEGER,
            page_content TEXT,
            page_binary BLOB)''')

        page_count = 0
        # Stays None when the directory holds no PDF, so the summary line
        # below never touches unbound names.
        last_seen = None
        for filename in os.listdir(single_page_pdf_dir):
            if not filename.endswith(".pdf"):
                continue
            full_path = os.path.join(single_page_pdf_dir, filename)
            base_filename = os.path.basename(full_path)
            last_seen = (full_path, base_filename)
            try:
                match = page_pattern.search(base_filename.lower())
                if match is None:
                    raise ValueError(
                        "file name does not end with _page<number>.pdf"
                    )
                # int() drops the zero padding and stores a real integer.
                page_num = int(match.group(1))

                with open(full_path, "rb") as binary_file:
                    page_binary = binary_file.read()
                with fitz.open(full_path) as doc:
                    text = doc.load_page(0).get_text("text")

                # Hash of the file name (SHA-256) used as the primary key.
                file_hash = hashlib.sha256(base_filename.encode()).hexdigest()
                c.execute(
                    "INSERT INTO pdf_pages VALUES (?, ?, ?, ?, ?)",
                    (file_hash, base_filename, page_num, text,
                     sqlite3.Binary(page_binary)),
                )
                page_count += 1
            except Exception as e:
                # Deliberate best effort: report the file and keep going.
                print(f"处理文件 {filename} 时出错: {e}")

        conn.commit()
        if last_seen is not None:
            full_path, base_filename = last_seen
            print(f"最后一个文件的full_path: {full_path}, base_filename: {base_filename}")
    finally:
        # Release the database handle even when listing or SQL fails.
        conn.close()
    print(f'导入{page_count}页内容到SQLite数据库中')
if __name__ == "__main__":
    # Command-line entry point: split every PDF found directly inside
    # ``pdf_dir`` into single-page PDFs, then import those pages into the
    # SQLite database ``db_filename``.
    parser = ArgumentParser(description="将指定目录下的所有PDF文件分割为单页,并将每页的内容和二进制数据存至SQLite数据库中")
    parser.add_argument("db_filename", help="目标SQLite数据库文件名")
    parser.add_argument("pdf_dir", help="包含PDF文件的目录路径")
    args = parser.parse_args()

    # List the input directory first, so a missing ``pdf_dir`` still fails
    # here instead of being silently created by makedirs below.
    source_pdfs = [
        name for name in os.listdir(args.pdf_dir) if name.endswith(".pdf")
    ]

    # Step 1: split each PDF file into single-page PDFs.
    single_page_pdf_dir = f'{args.pdf_dir}/single_page_pdfs'
    # Create the output directory up front: when no PDF is found, nothing
    # else creates it and the import step would fail listing it.
    os.makedirs(single_page_pdf_dir, exist_ok=True)
    for filename in source_pdfs:
        full_path = os.path.join(args.pdf_dir, filename)
        extract_pages_from_pdf(full_path, single_page_pdf_dir)

    # Step 2: store the text and binary data of every single-page PDF in
    # the SQLite database.
    process_pdf_to_sqlite(single_page_pdf_dir, args.db_filename)