55 脚本

55.1 脚本传参数

# 脚本文件名: myscript.py
import sys

# Everything passed on the command line, script name included.
arguments = sys.argv

# sys.argv[0] is the script name, so a list of length one means the user
# supplied no arguments at all.
if len(arguments) <= 1:
    print("Usage: python myscript.py <arg1> <arg2> ...")
    sys.exit(1)

# Report what was received (the count includes the script name).
print("Number of arguments:", len(arguments))
print("Argument List:", str(arguments))

# Split off the script name and walk the user-supplied arguments only.
_, *user_args = arguments
for value in user_args:
    print("Argument:", value)

55.2 执行方法

55.2.1 终端里执行

python myscript.py arg1 arg2 arg3

55.2.2 python代码执行

import subprocess

# Command line to launch: interpreter, script, then the script's arguments.
command = ['python', 'myscript.py', 'arg1', 'arg2', 'arg3']

# Run the child process, capturing its standard output as raw bytes.
result = subprocess.run(command, stdout=subprocess.PIPE)

# A non-zero return code means the child reported a failure.
if result.returncode != 0:
    print("Command failed with return code:", result.returncode)
else:
    print("Command executed successfully!")
    raw_output = result.stdout
    # Decode as UTF-8 first; fall back to GBK when the bytes are not valid UTF-8.
    try:
        output = raw_output.decode('utf-8')
    except UnicodeDecodeError:
        output = raw_output.decode('gbk')
    print(output)

55.3 实例

55.3.1 临床试验注册

# 脚本文件名: project_registry_check.py

import sys
import os
import math
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import pandas as pd
from bs4 import BeautifulSoup
import yagmail
from myconfig import mail_user, mail_password


# Read the command line first. sys.argv[0] is the script name, so the
# affiliation (sponsor) to search for is expected at index 1. Validating it
# before the browser is launched means no Chrome instance is started for an
# invocation that cannot proceed.
arguments = sys.argv

if len(arguments) < 2:
    print("Usage: python project_registry_check.py <affiliation>")
    sys.exit(1)

affiliation = arguments[1]

# Location of the chromedriver executable.
chrome_driver_path = '/usr/lib/chromium-browser/chromedriver'

# Chrome start-up options.
chrome_options = Options()
chrome_options.add_argument('--headless')  # headless mode: no browser window is shown
chrome_options.add_argument('--no-sandbox')  # Chrome only started correctly with this flag

# Create the Chrome WebDriver instance shared by the functions below.
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

## Fetch the HTML of a web page
def get_page_content(url, driver=driver):
    """Open *url* in the browser and return the resulting page source.

    Args:
        url: Address to load.
        driver: WebDriver used for the request; defaults to the module-level
            ``driver``.

    Returns:
        The value of ``driver.page_source`` after navigating to *url*.
    """
    driver.get(url)
    html = driver.page_source

    # The progress message is printed only when the page source is non-empty.
    if not html:
        return html

    print("获取页面内容...")
    return html


## Extract information from the HTML

def extract_info_from_page_content(page_content, driver = driver):
    """Build one table describing every trial listed on a search-result page.

    Three pieces are collected and merged:

    1. the first HTML table of the page, minus the '历史版本' and '注册题目'
       columns;
    2. for every table row that has ``<td>`` cells: the text of the second
       cell (used as registration number) and the ``href`` / ``title`` of the
       link in the third cell;
    3. for every such link: the detail page is opened with *driver*, and
       cell (0, 1) of its first table and cell (0, 3) of its second table are
       read as registration number and project director respectively.

    Args:
        page_content: HTML source (str) of a search-result page.
        driver: WebDriver used to open the detail pages; defaults to the
            module-level ``driver``.

    Returns:
        A DataFrame produced by merging the three pieces on their shared
        column names (expected to be '注册号').
    """
    # Local import: pandas deprecated passing a literal HTML string to
    # read_html, so the markup is wrapped in a file-like object instead.
    from io import StringIO

    ### General information
    print("获取临床研究一般信息...")
    general = pd.read_html(StringIO(page_content))[0].drop(columns=['历史版本', '注册题目'])

    ### Links
    print("获取临床研究链接...")
    soup = BeautifulSoup(page_content, features="lxml")
    table = soup.find('table')

    link_rows = []
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without <td> cells (e.g. a header row) carry no data.
            continue
        anchor = tds[2].find('a')  # looked up once, used for both href and title
        link_rows.append({
            '链接': f"https://www.chictr.org.cn/{anchor['href']}",
            '注册号': tds[1].text,
            '注册题目': anchor['title'],
        })

    link_df = pd.DataFrame(link_rows, columns=['链接', '注册号', '注册题目'])

    ### Project directors
    print("获取临床研究负责人...")
    registration_nos = []
    directors = []
    for url in link_df['链接'].values:
        driver.get(url)
        # Parse the detail page once and reuse the resulting list of tables.
        detail_tables = pd.read_html(StringIO(driver.page_source))
        registration_nos.append(detail_tables[0].iloc[0, 1])
        directors.append(detail_tables[1].iloc[0, 3])

    director_df = pd.DataFrame({'项目负责人': directors,
                                '注册号': registration_nos})

    # Join the three tables; merge() uses the column names they have in common.
    return general.merge(link_df).merge(director_df)

## Send the notification e-mail
def send_info(send_info_html):
    """E-mail the given HTML to the fixed recipient list.

    The subject line mentions the module-level ``affiliation``; the SMTP
    account comes from ``mail_user`` / ``mail_password``.

    Args:
        send_info_html: HTML text used as the message contents.
    """
    # Several recipient addresses.
    recipient_list = ['hulinhui@live.cn', 'mmkejiaoke@163.com', 'hym202008@163.com']

    yag = yagmail.SMTP(user=mail_user, password= mail_password, host='smtp.163.com')
    try:
        yag.send(to=recipient_list,
                 subject=f'提醒：{affiliation}有新的临床试验注册，需确认项目负责人有无及时备案',
                 contents = send_info_html)
    finally:
        # Close the SMTP connection even when sending raised an exception.
        yag.close()


def extract_page_count(page_content):
    """Return the number of result pages for a search.

    The total item count is read from the ``<span id="data-total">`` element
    of *page_content* and divided by 10 (presumably the site's page size —
    confirm against the site), rounding up.
    """
    parsed = BeautifulSoup(page_content, features="lxml")
    total_span = parsed.find('span', id ='data-total')
    total_items = int(total_span.text)

    pages = total_items / 10
    return math.ceil(pages)
    


registration_table = []

# CSV with the registrations recorded by earlier runs for this affiliation.
data_file = f'data/{affiliation}临床试验注意清单.csv'
# Without a previous CSV every result page has to be collected once.
first_run = not os.path.exists(data_file)

try:
    # Determine the number of result pages.
    print("获取查询结果的页数...")

    url = f'https://www.chictr.org.cn/searchproj.html?sponsor={affiliation}'
    page_content = get_page_content(url)
    page_count = extract_page_count(page_content)

    print(f"获取查询结果的页数为：{page_count}。")

    if page_count != 0:
        # Page 1 is always processed.
        print("处理第1页列表")
        registration_table.append(extract_info_from_page_content(page_content))

        # New registrations always appear on page 1 and there are never many
        # of them, so once the full list has been collected (first run) only
        # page 1 needs to be fetched on later runs.
        if first_run:
            for page_num in range(2, page_count + 1):
                print(f"处理第{page_num}页列表")
                url = f'https://www.chictr.org.cn/searchproj.html?sponsor={affiliation}&page={page_num}'

                page_content = get_page_content(url)
                registration_table.append(extract_info_from_page_content(page_content))
finally:
    # Close the browser on every path: results found, no results, or an error.
    driver.quit()

if page_count != 0:
    ## Combine the scraped pages
    df = pd.concat(registration_table, ignore_index = True)

    ## Work out whether there are new registrations
    if first_run:
        old_df = None
        old_registration_no = set()
    else:
        old_df = pd.read_csv(data_file)
        old_registration_no = set(old_df['注册号'])

    diff_registration_no = set(df['注册号']) - old_registration_no

    # When new registrations exist, export them as HTML and send them by e-mail.
    if diff_registration_no:
        print(f"{affiliation}有新注册临床研究，已发送至邮箱。")
        send_info_html = df[df['注册号'].isin(diff_registration_no)].to_html()
        send_info(send_info_html)
    else:
        print(f"{affiliation}无新注册临床研究")

    # Save the data
    print("保存数据...")

    # On later runs df holds page 1 only. Keep the previously stored rows as
    # well, so the CSV stays a complete list instead of shrinking to one page
    # (freshly scraped rows win when a registration number appears in both).
    if old_df is not None:
        df = pd.concat([df, old_df], ignore_index=True).drop_duplicates(subset='注册号')

    # Make sure the target directory exists before writing.
    os.makedirs(os.path.dirname(data_file), exist_ok=True)
    df.to_csv(data_file, index=False)

else:
    print(f"{affiliation}无临床试验注册项目")

55.3.1.1 执行

import subprocess

# Command line to launch: interpreter, script, then the affiliation to check.
command = ['python', 'project_registry_check.py', '茂名市人民医院']

# Run the child process, capturing its standard output as raw bytes.
result = subprocess.run(command, stdout=subprocess.PIPE)

# A non-zero return code means the child reported a failure.
if result.returncode != 0:
    print("Command failed with return code:", result.returncode)
else:
    print("Command executed successfully!")
    raw_output = result.stdout
    # Decode as UTF-8 first; fall back to GBK when the bytes are not valid UTF-8.
    try:
        output = raw_output.decode('utf-8')
    except UnicodeDecodeError:
        output = raw_output.decode('gbk')
    print(output)

55.3.2 渲染quarto book

由ipynb转换为qmd

import os
import subprocess
import sys

def _has_notebook_suffix(notebook):
    """Return True when the name, with '.ipynb' removed, ends with '_notebook'."""
    return notebook.replace(".ipynb", "").endswith('_notebook')


def _suffixed_twin_exists(notebook):
    """Return True when '<name>_notebook.ipynb' exists next to '<name>.ipynb'."""
    return os.path.exists(notebook.replace(".ipynb", "_notebook.ipynb"))


def _quarto(*args):
    """Run ``quarto`` with *args* passed as separate argv entries (no shell).

    Passing a list keeps file names containing spaces or shell
    metacharacters intact. shutil.which resolves the executable through PATH
    (and PATHEXT on Windows, e.g. a ``quarto.cmd`` wrapper).
    """
    import shutil  # local import: only needed here

    executable = shutil.which('quarto') or 'quarto'
    return subprocess.run([executable, *args])


def _convert(notebook):
    """Convert *notebook* to a .qmd file in the same directory; return its name."""
    # quarto deletes an .ipynb with the same base name while rendering, so
    # the .qmd file must not share its name with the notebook.
    qmd_file = notebook.replace("_notebook", "").replace('.ipynb', '.qmd')
    _quarto('convert', notebook, '--output', qmd_file)
    return qmd_file


def main(argv=None):
    """Convert notebooks to .qmd and render them with quarto.

    ``argv[1]`` is either ``all`` (convert every .ipynb in the current
    directory, then render the whole project) or the name of a single
    ``*_notebook.ipynb`` file (convert and render just that file).

    Raises:
        ValueError: for a file that is not .ipynb, does not exist, or lacks
            the ``_notebook`` suffix.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Usage: python render.py <all | xxx_notebook.ipynb>")
        sys.exit(1)

    # First command-line argument
    notebook = argv[1]

    if notebook == "all":
        # Every .ipynb file in the current directory
        candidates = [name for name in os.listdir() if name.endswith('.ipynb')]

        for candidate in candidates:
            if not _has_notebook_suffix(candidate):
                if _suffixed_twin_exists(candidate):
                    # Notebook left behind by a failed render: delete it and
                    # carry on with the batch instead of aborting.
                    os.remove(candidate)
                    print(f"Removed leftover notebook: {candidate}")
                    continue
                raise ValueError("Input file must contain suffix of '_notebook'.")
            _convert(candidate)

        # Render the whole project
        _quarto('render')
        return

    # Make sure the given file is an existing .ipynb with the expected suffix
    if not notebook.endswith('.ipynb'):
        raise ValueError("Input file must be a .ipynb file.")
    if not os.path.exists(notebook):
        raise ValueError("Input file must exist.")
    if not _has_notebook_suffix(notebook):
        if _suffixed_twin_exists(notebook):
            # Notebook left behind by a failed render
            os.remove(notebook)
        raise ValueError("Input file must contain suffix of '_notebook'.")

    # Convert, then render just this file
    qmd_file = _convert(notebook)
    _quarto('render', qmd_file)


if __name__ == "__main__":
    main()

55.3.2.1 执行

全部执行

python render.py all

执行个别

python render.py xxx_notebook.ipynb

55.3.3 删除yaml的jupyter设置

在jupyter lab中打开qmd文件，保存后会自动添加隐藏的jupyter的yaml设置（通过nano xxx.qmd可以看到），导致部分内容渲染失败，如DT交互式表格，故通过该脚本将jupyter的yaml设置删除（即替换为空字符串）。

import re
import sys

def remove_jupyter_yaml(qmd_file):
    """Delete the top-level ``jupyter:`` entry from a .qmd file's YAML front matter.

    Only the front matter (from a ``---`` on the first line to the next
    ``---`` / ``...`` line) is inspected, so ``---`` sequences in the document
    body, such as markdown tables or horizontal rules, are never touched.
    The ``jupyter:`` line and the indented lines nested under it are removed.
    When the file has no front matter or no ``jupyter:`` key, it is left
    unchanged.

    Args:
        qmd_file: Path of the .qmd file to rewrite in place.
    """
    # Read and write as UTF-8 explicitly instead of the platform default.
    with open(qmd_file, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines(keepends=True)

    # Find the closing delimiter of a front matter that opens on line 1.
    closing = None
    if lines and lines[0].rstrip() == '---':
        for idx in range(1, len(lines)):
            if lines[idx].rstrip() in ('---', '...'):
                closing = idx
                break
    if closing is None:
        print("No YAML front matter found; file left unchanged.")
        return

    # A top-level key starts in column 0.
    start = next(
        (idx for idx in range(1, closing) if lines[idx].startswith('jupyter:')),
        None,
    )
    if start is None:
        print("No jupyter yaml found; file left unchanged.")
        return

    # The entry extends over the following indented (or blank) lines...
    end = start + 1
    while end < closing and (lines[end][:1] in (' ', '\t') or not lines[end].strip()):
        end += 1
    # ...but trailing blank lines are kept so the spacing before the next
    # key is preserved.
    while end > start + 1 and not lines[end - 1].strip():
        end -= 1

    del lines[start:end]

    with open(qmd_file, 'w', encoding='utf-8') as f:
        f.writelines(lines)
    print("Success: Jupyter yaml has been removed.")

if __name__ == "__main__":
    # Exactly one positional argument, the .qmd path, is expected.
    if len(sys.argv) == 2:
        target_file = sys.argv[1]
        remove_jupyter_yaml(target_file)
    else:
        print("Usage: python remove_jupyter_yaml.py <qmd_file>")
        sys.exit(1)

55.3.3.1 执行

python remove_jupyter_yaml.py conclusion-huajuhong.qmd

55.3.4 批量转换ipynb文件至qmd后渲染

# render.py

import os
import subprocess
import sys

def _has_notebook_suffix(notebook):
    """Return True when the name, with '.ipynb' removed, ends with '_notebook'."""
    return notebook.replace(".ipynb", "").endswith('_notebook')


def _suffixed_twin_exists(notebook):
    """Return True when '<name>_notebook.ipynb' exists next to '<name>.ipynb'."""
    return os.path.exists(notebook.replace(".ipynb", "_notebook.ipynb"))


def _quarto(*args):
    """Run ``quarto`` with *args* passed as separate argv entries (no shell).

    Passing a list keeps file names containing spaces or shell
    metacharacters intact. shutil.which resolves the executable through PATH
    (and PATHEXT on Windows, e.g. a ``quarto.cmd`` wrapper).
    """
    import shutil  # local import: only needed here

    executable = shutil.which('quarto') or 'quarto'
    return subprocess.run([executable, *args])


def _convert(notebook):
    """Convert *notebook* to a .qmd file in the parent directory; return its name."""
    # quarto deletes an .ipynb with the same base name while rendering, so
    # the .qmd file must not share its name with the notebook.
    qmd_file = notebook.replace("_notebook", "").replace('.ipynb', '.qmd')
    _quarto('convert', notebook, '--output', f'../{qmd_file}')
    return qmd_file


def main(argv=None):
    """Convert notebooks to .qmd files in the parent directory and render them.

    ``argv[1]`` is either ``all`` (convert every .ipynb in the current
    directory, then render the project in ``../``) or the name of a single
    ``*_notebook.ipynb`` file (convert and render just that file).

    Raises:
        ValueError: for a file that is not .ipynb, does not exist, or lacks
            the ``_notebook`` suffix.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Usage: python render.py <all | xxx_notebook.ipynb>")
        sys.exit(1)

    # First command-line argument
    notebook = argv[1]

    if notebook == "all":
        # Every .ipynb file in the current directory
        candidates = [name for name in os.listdir() if name.endswith('.ipynb')]

        for candidate in candidates:
            if not _has_notebook_suffix(candidate):
                if _suffixed_twin_exists(candidate):
                    # Notebook left behind by a failed render: delete it and
                    # carry on with the batch instead of aborting.
                    os.remove(candidate)
                    print(f"Removed leftover notebook: {candidate}")
                    continue
                raise ValueError("Input file must contain suffix of '_notebook'.")
            _convert(candidate)

        # Render the whole project located in the parent directory
        _quarto('render', '../')
        return

    # Make sure the given file is an existing .ipynb with the expected suffix
    if not notebook.endswith('.ipynb'):
        raise ValueError("Input file must be a .ipynb file.")
    if not os.path.exists(notebook):
        raise ValueError("Input file must exist.")
    if not _has_notebook_suffix(notebook):
        if _suffixed_twin_exists(notebook):
            # Notebook left behind by a failed render
            os.remove(notebook)
        raise ValueError("Input file must contain suffix of '_notebook'.")

    # Convert, then render just this file
    qmd_file = _convert(notebook)
    _quarto('render', f'../{qmd_file}')


if __name__ == "__main__":
    main()

55.3.4.1 执行

cd notebook

python render.py all   # 渲染整个项目的所有文件

python render.py xxx_notebook.ipynb   # 渲染单个文件