import os
import PyPDF2
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML
import re
def extract_text_from_pdf(pdf_path):
    """Extract the text of one PDF into a DataFrame with one row per text line.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A pandas DataFrame with four columns:
        '文件名' (the path that was passed in), '页码' (1-based page number),
        '行' (1-based line number within the page) and '文本' (the line text).
        A page that yields no text contributes no rows.
    """
    data = {
        '文件名': [],
        '页码': [],
        '行': [],
        '文本': [],
    }
    # The reader pulls page content from the open handle, so all extraction
    # happens inside the `with` block.
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages, start=1):
            # Defensive: treat a falsy result (None / empty string) as "no text".
            page_text = page.extract_text() or ''
            # Split on real line breaks. The previous separator ' \n ' only
            # matched a newline surrounded by spaces, so whole pages ended up
            # as a single "line".
            for line_num, line in enumerate(page_text.splitlines(), start=1):
                data['文件名'].append(pdf_path)
                data['页码'].append(page_num)
                data['行'].append(line_num)
                data['文本'].append(line)
    return pd.DataFrame(data)
def extract_text_from_pdfs(folder_path):
    """Extract the text lines of every PDF directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A single pandas DataFrame that concatenates the per-file frames
        returned by ``extract_text_from_pdf`` with a fresh 0..n-1 index.
        When the folder contains no PDF files, an empty DataFrame with the
        columns '文件名', '页码', '行', '文本' is returned instead of raising.
    """
    frames = [
        extract_text_from_pdf(os.path.join(folder_path, filename))
        # Sorted so the row order does not depend on directory listing order.
        for filename in sorted(os.listdir(folder_path))
        # Case-insensitive so that 'REPORT.PDF' is picked up as well.
        if filename.lower().endswith('.pdf')
    ]
    if not frames:
        # pd.concat([]) raises ValueError; hand back an empty table with the
        # expected columns so downstream filtering keeps working.
        return pd.DataFrame(columns=['文件名', '页码', '行', '文本'])
    return pd.concat(frames, ignore_index=True)
# Show full cell contents instead of truncating long text columns.
pd.set_option('display.max_colwidth', None)

# Load every PDF in the data folder into one table of text lines.
folder_path = 'data-doctor'
df = extract_text_from_pdfs(folder_path)

# Text box for the search term.
search_box = widgets.Text(description='搜索:')

# Current-page input; the total page count starts at 1.
page_text = widgets.BoundedIntText(
    description='页数',
    min=1,
    max=1,
    step=1,
    value=1,
)

# Previous / next page buttons.
prev_button = widgets.Button(description='◀')
next_button = widgets.Button(description='▶')

# Output area that receives the rendered result table.
output = widgets.Output()
def filter_table(search_value, page):
    """Render one page of the rows of ``df`` whose text matches the search term.

    Reads the module-level ``df`` and updates the module-level widgets
    ``page_text`` (its ``max`` and ``value``) and ``output`` (the rendered
    table), then refreshes the total-page label via
    ``update_pagination_label()``.

    Args:
        search_value: Search term. It is used as a regular expression; if it
            is not a valid pattern it is searched for literally. An empty
            term selects every row and highlights nothing.
        page: Requested 1-based page number. Values outside the valid range
            are clamped to ``1..total_pages``.
    """
    import html  # local import: `html` is not among this file's top-level imports

    rows_per_page = 5
    highlight_open = '<span style="background-color:yellow">'

    pattern = None
    if search_value:
        try:
            pattern = re.compile(search_value)
        except re.error:
            # A half-typed or accidental metacharacter such as "(" must not
            # raise inside the widget callback; search for it literally.
            pattern = re.compile(re.escape(search_value))

    if pattern is None:
        filtered_df = df.copy()
    else:
        matches = df['文本'].str.contains(pattern.pattern, regex=True, na=False)
        filtered_df = df[matches].copy()

    # Update the total page count: ceiling division, but never fewer than one
    # page. (`len // 5 + 1` produced an empty extra page for 5, 10, ... rows.)
    total_pages = max(1, (len(filtered_df) + rows_per_page - 1) // rows_per_page)
    # Make sure the current page stays within 1..total_pages, and use the
    # clamped value for slicing too so a narrowed search never shows an
    # empty table.
    current_page = min(max(page, 1), total_pages)
    page_text.max = total_pages
    page_text.value = current_page

    # Start and end row offsets of the current page.
    page_start = (current_page - 1) * rows_per_page
    page_end = page_start + rows_per_page
    page_df = filtered_df.iloc[page_start:page_end].copy()

    def render_cell(text):
        """Return HTML-escaped cell text with each match wrapped in a highlight span."""
        text = str(text)
        if pattern is None:
            return html.escape(text, quote=False)
        parts = []
        cursor = 0
        for match in pattern.finditer(text):
            if match.end() == match.start():
                continue  # zero-width matches have nothing to highlight
            parts.append(html.escape(text[cursor:match.start()], quote=False))
            # Wrap the text that actually matched, not the pattern source.
            parts.append(
                f'{highlight_open}{html.escape(match.group(0), quote=False)}</span>'
            )
            cursor = match.end()
        parts.append(html.escape(text[cursor:], quote=False))
        return ''.join(parts)

    # The table is emitted with escape=False so the highlight spans render;
    # therefore the PDF-derived strings are escaped here by hand.
    page_df['文本'] = page_df['文本'].map(render_cell)
    page_df['文件名'] = page_df['文件名'].map(
        lambda value: html.escape(str(value), quote=False)
    )

    with output:
        output.clear_output()
        # DataFrame.to_html builds the table markup; escape=False keeps the spans.
        display(HTML(page_df.to_html(escape=False)))

    update_pagination_label()  # refresh the displayed total page count
def prev_page(button_event):
    """Button handler: step the page input back by one, stopping at page 1.

    Args:
        button_event: Argument supplied by the button click callback; unused.
    """
    if page_text.value <= 1:
        return  # already on the first page
    page_text.value = page_text.value - 1
def next_page(button_event):
    """Button handler: advance the page input by one, stopping at its maximum.

    Args:
        button_event: Argument supplied by the button click callback; unused.
    """
    if page_text.value >= page_text.max:
        return  # already on the last page
    page_text.value = page_text.value + 1
def update_pagination_label():
    """Show the current total page count (``page_text.max``) in ``pagination_label``."""
    total_pages = page_text.max
    # Refresh the displayed total page count.
    pagination_label.value = f'/ {total_pages} '
# Label showing the total page count. It must be bound to the name
# `pagination_label`, because update_pagination_label() and
# update_pagination_box() look it up by that name; previously the label was
# created anonymously and those functions raised NameError.
pagination_label = widgets.Label(f'/ {page_text.max} ')


def _refresh_table(change):
    """Re-render the table from the current search term and page number."""
    filter_table(search_box.value, page_text.value)


# Watch the search box and the page number and re-render the table on change.
search_box.observe(_refresh_table, names='value')
page_text.observe(_refresh_table, names='value')
prev_button.on_click(prev_page)
next_button.on_click(next_page)

# Pagination row: ◀ button, caption, page input, total-page label, ▶ button.
pagination_box = widgets.HBox(
    [prev_button, widgets.Label('页码:'), page_text, pagination_label, next_button]
)
def update_pagination_box():
    """Reset the children of ``pagination_box``.

    The row becomes: previous button, a fresh caption label, the page input,
    ``pagination_label`` and the next button.
    """
    caption = widgets.Label('页码:')
    row = [prev_button, caption, page_text, pagination_label, next_button]
    pagination_box.children = row
# Put the pagination row into its final layout before showing it.
update_pagination_box()

# Stack the search box, the pagination row and the result area vertically.
display(
    widgets.VBox(
        children=[search_box, pagination_box, output],
    )
)