import os
import PyPDF2
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML
import re

def extract_text_from_pdf(pdf_path):
    """Read a PDF and return its text as a DataFrame with one row per line.

    Each page's extracted text is split on ``'\\n'``; every resulting piece
    becomes a row carrying the source path, the 1-based page number, the
    1-based line number within that page, and the line text.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``pandas.DataFrame`` with the columns ``文件名``, ``页码``, ``行``
        and ``文本``.
    """
    file_names = []
    page_numbers = []
    line_numbers = []
    texts = []

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Page and line numbers are reported 1-based.
        for page_number, pdf_page in enumerate(pdf_reader.pages, start=1):
            page_lines = pdf_page.extract_text().split('\n')
            for line_number, line_text in enumerate(page_lines, start=1):
                file_names.append(pdf_path)
                page_numbers.append(page_number)
                line_numbers.append(line_number)
                texts.append(line_text)

    return pd.DataFrame({
        '文件名': file_names,
        '页码': page_numbers,
        '行': line_numbers,
        '文本': texts,
    })

def extract_text_from_pdfs(folder_path):
    """Extract the text of every PDF directly inside *folder_path*.

    Files are matched by a case-insensitive ``.pdf`` extension and processed
    in sorted name order so the row order of the result is reproducible
    (``os.listdir`` returns entries in arbitrary order). Sub-directories are
    not searched, and directories that happen to end in ``.pdf`` are skipped.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        A single ``pandas.DataFrame`` concatenating the per-file results of
        ``extract_text_from_pdf`` with a fresh index. When the folder holds
        no PDF files, an empty DataFrame with the columns ``文件名``,
        ``页码``, ``行`` and ``文本`` is returned.
    """
    pdf_paths = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith('.pdf')
    ]

    frames = [
        extract_text_from_pdf(path)
        for path in pdf_paths
        if os.path.isfile(path)
    ]

    if not frames:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # table with the expected columns so downstream filtering still works.
        return pd.DataFrame(columns=['文件名', '页码', '行', '文本'])

    return pd.concat(frames, ignore_index=True)

# Load the text of every PDF in the data folder into one module-level table;
# filter_table() reads this `df` on every search.
folder_path = 'data-doctor'
df = extract_text_from_pdfs(folder_path)

# A max_colwidth of None stops pandas from truncating long text cells.
pd.set_option('display.max_colwidth', None)

# Create the text box used for searching
search_box = widgets.Text(description='搜索:')
page_text = widgets.BoundedIntText(description='页数', min=1, max=1, step=1, value=1)  # the total page count starts at 1
prev_button = widgets.Button(description='◀')
next_button = widgets.Button(description='▶')

# Output area that filter_table() clears and redraws the result table into.
output = widgets.Output()

def _compile_search_pattern(search_value):
    """Compile *search_value* as a regex, falling back to a literal match.

    A pattern that does not compile (for example a lone ``(``) is escaped and
    matched literally instead of raising ``re.error``.
    """
    try:
        return re.compile(search_value)
    except re.error:
        return re.compile(re.escape(search_value))


def _highlight_as_html(text, pattern):
    """Return *text* HTML-escaped, with each non-empty match of *pattern* highlighted."""
    import html  # local import: the module is only needed by this helper

    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        if match.start() == match.end():
            # Zero-length matches (e.g. an empty search box) have nothing to
            # highlight; wrapping them would scatter empty spans in the text.
            continue
        pieces.append(html.escape(text[cursor:match.start()]))
        pieces.append(
            '<span style="background-color:yellow">'
            + html.escape(match.group(0))
            + '</span>'
        )
        cursor = match.end()
    pieces.append(html.escape(text[cursor:]))
    return ''.join(pieces)


def filter_table(search_value, page, page_size=5):
    """Show one page of the rows of ``df`` whose text matches *search_value*.

    The search value is interpreted as a regular expression (or literally if
    it is not a valid one). The page selector's maximum and value are updated
    to fit the number of matches, the requested page of matches is rendered
    into ``output`` as an HTML table with the matched text highlighted, and
    the total-page label is refreshed.

    Args:
        search_value: Text typed into the search box.
        page: 1-based page number to show; clamped to the valid range.
        page_size: Number of rows per page. Defaults to 5.

    Raises:
        ValueError: If *page_size* is smaller than 1.
    """
    if page_size < 1:
        raise ValueError('page_size must be at least 1')

    pattern = _compile_search_pattern(search_value)
    matches = df[df['文本'].str.contains(pattern.pattern, regex=True, na=False)]

    # Ceiling division, with a floor of one page so the selector stays valid
    # even when nothing matches.
    total_pages = max(1, (len(matches) + page_size - 1) // page_size)
    current_page = min(max(page, 1), total_pages)
    page_text.max = total_pages
    page_text.value = current_page

    # Slice with the clamped page number so a shrinking result set never
    # leaves the view on a page that no longer exists.
    page_start = (current_page - 1) * page_size
    page_df = matches.iloc[page_start:page_start + page_size].copy()

    # The table is emitted with escape=False so the highlight markup survives;
    # the string columns therefore have to be escaped here. Only the visible
    # rows are processed.
    page_df['文件名'] = page_df['文件名'].astype(str).map(html_escape_cell)
    page_df['文本'] = page_df['文本'].map(
        lambda text: _highlight_as_html(text, pattern)
    )

    with output:
        output.clear_output()
        display(HTML(page_df.to_html(escape=False)))

    update_pagination_label()  # refresh the displayed total page count


def html_escape_cell(value):
    """Return *value* with HTML special characters escaped."""
    import html  # local import: the module is only needed by this helper

    return html.escape(value)

def prev_page(button_event):
    """Move the page selector back one page; do nothing on the first page."""
    if not page_text.value > 1:
        return
    page_text.value = page_text.value - 1

def next_page(button_event):
    """Move the page selector forward one page; do nothing on the last page."""
    if not page_text.value < page_text.max:
        return
    page_text.value = page_text.value + 1

def update_pagination_label():
    """Show the page selector's current maximum in the total-page label."""
    total_pages = page_text.max
    pagination_label.value = f'/{total_pages}'

# Re-render the table whenever the search text or the page number changes.
search_box.observe(lambda event: filter_table(search_box.value, page_text.value), names='value')
page_text.observe(lambda event: filter_table(search_box.value, page_text.value), names='value')
prev_button.on_click(prev_page)
next_button.on_click(next_page)

# The total-page label must be bound to a name: update_pagination_label() and
# update_pagination_box() both look it up as `pagination_label`, and leaving
# it anonymous makes them fail with NameError.
pagination_label = widgets.Label(f'/{page_text.max}')
pagination_box = widgets.HBox([prev_button, widgets.Label('页码:'), page_text, pagination_label, next_button])

def update_pagination_box():
    """Rebuild the pagination row: previous button, caption, page selector, total label, next button."""
    controls = [
        prev_button,
        widgets.Label('页码:'),
        page_text,
        pagination_label,
        next_button,
    ]
    pagination_box.children = controls

# Install the pagination row's children before the UI is shown.
update_pagination_box()

# Render the search box, the pagination row and the result area stacked vertically.
display(widgets.VBox([search_box, pagination_box, output]))
Acute kidney injury (AKI) and sepsis carry consensus definitions

Sepsis-3 definitions

Diagnosis and definition of S-AKI primarily rely on the KDIGO criteria and

Currently, no universally accepted definition of SA-AKI exists

How can the proposed definition of SA-AKI be operationalized in electronic health records?