# 脚本文件名: myscript.py
import sys
def main(argv=None):
    """Print the command-line arguments this script received.

    Args:
        argv: Full argument vector with the script name first. Defaults to
            ``sys.argv`` when omitted.

    Returns:
        The process exit status: 1 when no argument follows the script
        name (a usage line is printed), 0 otherwise.
    """
    arguments = sys.argv if argv is None else list(argv)

    # The first element is the script name, so real arguments start at index 1.
    if len(arguments) < 2:
        print("Usage: python myscript.py <arg1> <arg2> ...")
        return 1

    # Report what was passed in.
    print("Number of arguments:", len(arguments))
    print("Argument List:", str(arguments))

    # Walk the arguments, skipping the script name.
    for arg in arguments[1:]:
        print("Argument:", arg)
    return 0


if __name__ == "__main__":
    sys.exit(main())
55 脚本
55.1 脚本传参数
55.2 执行方法
55.2.1 终端里执行
python myscript.py arg1 arg2 arg3
55.2.2 python代码执行
import subprocess
def decode_output(raw):
    """Decode captured process output to text.

    UTF-8 is tried first. If that fails the bytes are decoded as GBK, with
    undecodable bytes replaced so that decoding never raises.

    Args:
        raw: Bytes captured from the child process.

    Returns:
        The decoded string.
    """
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # Fall back to GBK; errors='replace' keeps unexpected bytes from
        # turning a successful run into a crash.
        return raw.decode('gbk', errors='replace')


def run_command(command):
    """Run ``command``, print its captured stdout on success, and return its exit code.

    Args:
        command: Argument list handed to ``subprocess.run``.

    Returns:
        The return code of the child process.
    """
    result = subprocess.run(command, stdout=subprocess.PIPE)

    # Check whether the command succeeded.
    if result.returncode == 0:
        print("Command executed successfully!")
        print(decode_output(result.stdout))
    else:
        print("Command failed with return code:", result.returncode)
    return result.returncode


if __name__ == "__main__":
    # The command to execute and its arguments.
    command = ['python', 'myscript.py', 'arg1', 'arg2', 'arg3']
    run_command(command)
55.3 实例
55.3.1 临床试验注册
# 脚本文件名: project_registry_check.py
import sys
import os
import math
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import pandas as pd
from bs4 import BeautifulSoup
import yagmail
from myconfig import mail_user, mail_password
# Read the affiliation (sponsor) name to query from the command line.
# It is validated before the browser is started so that a missing argument
# does not fail with an IndexError after Chrome has already been launched.
arguments = sys.argv
if len(arguments) < 2:
    print("Usage: python project_registry_check.py <affiliation>")
    sys.exit(1)
affiliation = arguments[1]

# Path to the Chrome driver binary.
chrome_driver_path = '/usr/lib/chromium-browser/chromedriver'

# Chrome options.
chrome_options = Options()
chrome_options.add_argument('--headless')  # headless mode: no browser window is shown
chrome_options.add_argument('--no-sandbox')  # author's note: Chrome only starts normally with this flag

# Create the Chrome WebDriver instance.
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)
## Fetch the HTML content of a web page
def get_page_content(url, driver=driver):
    """Open ``url`` in the browser and return the resulting page source.

    Args:
        url: Address of the page to open.
        driver: WebDriver used for the request; defaults to the module-level
            ``driver``.

    Returns:
        The value of ``driver.page_source`` after navigating to ``url``.
    """
    # Request the page.
    driver.get(url)
    # Grab the full page content.
    page_content = driver.page_source

    if page_content:
        print("获取页面内容...")
    return page_content
## Extract registration information from a result-list page
def extract_info_from_page_content(page_content, driver=driver):
    """Build one table of trial registrations from a search-result page.

    Three pieces are collected and merged on their shared columns:
    the first HTML table of the page (minus the '历史版本' and '注册题目'
    columns), the link/registration number/title of every row, and the
    project director read from each trial's detail page.

    Args:
        page_content: HTML source of a search-result page.
        driver: WebDriver used to open each detail page; defaults to the
            module-level ``driver``.

    Returns:
        A pandas DataFrame combining the general information, the links and
        the project directors.
    """
    # Local import: pandas deprecates passing literal HTML strings to
    # read_html, so the markup is wrapped in a file-like object.
    from io import StringIO

    ### General information
    print("获取临床研究一般信息...")
    general = (
        pd.read_html(StringIO(page_content))[0]
        .drop(columns='历史版本')
        .drop(columns="注册题目")
    )

    ### Links
    print("获取临床研究链接...")
    soup = BeautifulSoup(page_content, features="lxml")
    table = soup.find('table')
    links = []
    registration_nos = []
    titles = []
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without <td> cells (presumably the header row) hold no data.
            continue
        anchor = tds[2].find('a')
        links.append(f"https://www.chictr.org.cn/{anchor['href']}")
        registration_nos.append(tds[1].text)
        titles.append(anchor['title'])

    link_df = pd.DataFrame({'链接': links,
                            '注册号': registration_nos,
                            '注册题目': titles})

    ### Project directors
    print("获取临床研究负责人...")
    registration_nos = []
    directors = []
    for url in links:
        driver.get(url)
        # Parse the detail page once and reuse the resulting tables.
        detail_tables = pd.read_html(StringIO(driver.page_source))
        # NOTE(review): the cell positions below are taken as-is from the
        # original script; confirm against the current detail-page layout.
        registration_nos.append(detail_tables[0].iloc[0, 1])
        directors.append(detail_tables[1].iloc[0, 3])

    director_df = pd.DataFrame({'项目负责人': directors,
                                '注册号': registration_nos})

    # Join the three tables (intended join key: the registration number).
    merge = general.merge(link_df).merge(director_df)
    return merge
## Send the notification e-mail
def send_info(send_info_html):
    """E-mail the HTML table of newly registered trials to the fixed recipients.

    Args:
        send_info_html: HTML text used as the message contents.
    """
    # Several recipient addresses.
    recipient_list = ['hulinhui@live.cn', 'mmkejiaoke@163.com', 'hym202008@163.com']

    yag = yagmail.SMTP(user=mail_user, password=mail_password, host='smtp.163.com')
    try:
        # Send the mail.
        yag.send(to=recipient_list,
                 subject=f'提醒:{affiliation}有新的临床试验注册,需确认项目负责人有无及时备案',
                 contents=send_info_html)
    finally:
        # Close the SMTP connection even when sending fails.
        yag.close()
def extract_page_count(page_content, page_size=10):
    """Return the number of result pages of a search.

    The total number of hits is read from the ``<span id="data-total">``
    element and divided by the page size, rounding up.

    Args:
        page_content: HTML source of the first search-result page.
        page_size: Number of entries listed per page (10 in the original
            script).

    Returns:
        The page count as an int; 0 when the search has no hits.

    Raises:
        ValueError: If the total-count element is missing or its text is
            not an integer.
    """
    soup = BeautifulSoup(page_content, features="lxml")
    total_span = soup.find('span', id='data-total')
    if total_span is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("Cannot find <span id='data-total'> in the search result page.")
    item_count = int(total_span.text)
    # Integer ceiling division; avoids going through a float.
    return -(-item_count // page_size)
# Query the registry for the affiliation, detect newly registered trials,
# notify by e-mail and update the saved list.
data_file = f'data/{affiliation}临床试验注意清单.csv'
registration_table = []

try:
    # Number of result pages.
    print("获取查询结果的页数...")
    url = f'https://www.chictr.org.cn/searchproj.html?sponsor={affiliation}'
    page_content = get_page_content(url)
    page_count = extract_page_count(page_content)
    print(f"获取查询结果的页数为:{page_count}。")

    if page_count != 0:
        # Page 1.
        print("处理第1页列表")
        registration_info = extract_info_from_page_content(page_content)
        registration_table.append(registration_info)

        # New registrations always show up on page 1 and there are never many
        # of them, so once the full list has been saved only page 1 needs to
        # be fetched. The remaining pages are crawled on the first run only.
        if not os.path.exists(data_file):
            for page_num in range(1, page_count):
                print(f"处理第{page_num+1}页列表")
                url = f'https://www.chictr.org.cn/searchproj.html?sponsor={affiliation}&page={page_num+1}'
                page_content = get_page_content(url)
                registration_info = extract_info_from_page_content(page_content)
                registration_table.append(registration_info)
finally:
    # Close the browser on every path, including zero results and errors.
    driver.quit()

if page_count != 0:
    ## Combine the fetched pages
    df = pd.concat(registration_table, ignore_index=True)

    ## Decide whether there are new registrations
    if os.path.exists(data_file):
        old_df = pd.read_csv(data_file)
        old_registration_no = old_df['注册号']
    else:
        old_df = None
        old_registration_no = []
    new_registration_no = df['注册号']
    diff_registration_no = set(new_registration_no) - set(old_registration_no)

    # When new registrations exist, export them as HTML and send them by mail.
    if diff_registration_no:
        send_info_html = df[df['注册号'].isin(diff_registration_no)].to_html()
        send_info(send_info_html)
        print(f"{affiliation}有新注册临床研究,已发送至邮箱。")
    else:
        print(f"{affiliation}无新注册临床研究")

    # Save the data.
    print("保存数据...")
    if old_df is not None:
        # Only page 1 was fetched on this run: keep the previously saved
        # rows that are not on it, otherwise the saved list would shrink to
        # a single page.
        kept = old_df[~old_df['注册号'].isin(df['注册号'])]
        df = pd.concat([df, kept], ignore_index=True)
    os.makedirs(os.path.dirname(data_file), exist_ok=True)
    df.to_csv(data_file, index=False)
else:
    print(f"{affiliation}无临床试验注册项目")
55.3.1.1 执行
import subprocess
def decode_output(raw):
    """Decode captured process output to text.

    UTF-8 is tried first. If that fails the bytes are decoded as GBK, with
    undecodable bytes replaced so that decoding never raises.

    Args:
        raw: Bytes captured from the child process.

    Returns:
        The decoded string.
    """
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # Fall back to GBK; errors='replace' keeps unexpected bytes from
        # turning a successful run into a crash.
        return raw.decode('gbk', errors='replace')


def run_command(command):
    """Run ``command``, print its captured stdout on success, and return its exit code.

    Args:
        command: Argument list handed to ``subprocess.run``.

    Returns:
        The return code of the child process.
    """
    result = subprocess.run(command, stdout=subprocess.PIPE)

    # Check whether the command succeeded.
    if result.returncode == 0:
        print("Command executed successfully!")
        print(decode_output(result.stdout))
    else:
        print("Command failed with return code:", result.returncode)
    return result.returncode


if __name__ == "__main__":
    # The command to execute and its arguments.
    command = ['python', 'project_registry_check.py', '茂名市人民医院']
    run_command(command)
55.3.2 渲染quarto book
由 ipynb 转换为 qmd
import os
import subprocess
import sys
NOTEBOOK_ENDING = '_notebook.ipynb'


def qmd_name_for(notebook):
    """Return the .qmd file name that belongs to a ``*_notebook.ipynb`` file.

    Quarto deletes an .ipynb with the same base name as the .qmd while
    rendering, so the two files must not share a name; the ``_notebook``
    suffix is what keeps them apart. Only the trailing suffix is removed.

    Args:
        notebook: File name ending in ``_notebook.ipynb``.

    Returns:
        The file name with ``_notebook.ipynb`` replaced by ``.qmd``.

    Raises:
        ValueError: If ``notebook`` does not end in ``_notebook.ipynb``.
    """
    if not notebook.endswith(NOTEBOOK_ENDING):
        raise ValueError("Input file must contain suffix of '_notebook'.")
    return notebook[:-len(NOTEBOOK_ENDING)] + '.qmd'


def check_notebook_name(notebook):
    """Reject an .ipynb file whose name lacks the ``_notebook`` suffix.

    If a sibling ``<name>_notebook.ipynb`` exists, the offending file is
    treated as an intermediate notebook left behind by a failed render and
    is deleted before the error is raised.

    Args:
        notebook: File name ending in ``.ipynb``.

    Raises:
        ValueError: If the name does not end in ``_notebook.ipynb``.
    """
    if notebook.endswith(NOTEBOOK_ENDING):
        return
    sibling = notebook[:-len('.ipynb')] + NOTEBOOK_ENDING
    if os.path.exists(sibling):
        os.remove(notebook)
    raise ValueError("Input file must contain suffix of '_notebook'.")


def convert_notebook(notebook):
    """Convert ``notebook`` to a .qmd file with ``quarto convert``.

    The command is passed as an argument list so file names containing
    spaces or shell metacharacters are handled correctly, and a failing
    conversion raises instead of being ignored.

    Returns:
        The path of the generated .qmd file.
    """
    qmd_file = qmd_name_for(notebook)
    subprocess.run(['quarto', 'convert', notebook, '--output', qmd_file], check=True)
    return qmd_file


def main(argv=None):
    """Convert notebooks to .qmd and render them with Quarto.

    Args:
        argv: Arguments after the script name; defaults to ``sys.argv[1:]``.
            The first one is either ``all`` (every .ipynb file in the
            current directory) or the name of a single ``*_notebook.ipynb``.

    Returns:
        1 when no argument was given (a usage line is printed), else 0.

    Raises:
        ValueError: If a file name is not an existing ``*_notebook.ipynb``.
        subprocess.CalledProcessError: If a quarto command fails.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("Usage: python render.py <all | xxx_notebook.ipynb>")
        return 1
    notebook = args[0]

    if notebook == "all":
        # Every .ipynb file in the current directory.
        notebooks = sorted(name for name in os.listdir() if name.endswith('.ipynb'))
        for name in notebooks:
            check_notebook_name(name)
            convert_notebook(name)
        # Render the whole project.
        subprocess.run(['quarto', 'render'], check=True)
        return 0

    # Make sure the given file is an existing .ipynb file.
    if not notebook.endswith('.ipynb'):
        raise ValueError("Input file must be a .ipynb file.")
    if not os.path.exists(notebook):
        raise ValueError("Input file must exist.")
    check_notebook_name(notebook)
    qmd_file = convert_notebook(notebook)
    # Render only the converted file.
    subprocess.run(['quarto', 'render', qmd_file], check=True)
    return 0


if __name__ == "__main__":
    sys.exit(main())
55.3.2.1 执行
- 全部执行
python render.py all
- 执行个别
python render.py xxx_notebook.ipynb
55.3.3 删除yaml的jupyter设置
在jupyter lab打开qmd文件,保存后会自动添加隐藏的jupyter的yaml设置(通过nano xxx.qmd可以看到),导致部分内容渲染失败,如DT的交互式表格,故通过该脚本将jupyter的yaml设置替换为空字符串(即删除)。
import re
import sys
def remove_jupyter_yaml(qmd_file):
    """Delete the ``jupyter:`` block from a .qmd file in place.

    The block runs from a line starting with ``jupyter:`` up to (but not
    including) the next line consisting of ``---``. The match is non-greedy,
    so later ``---`` lines in the document body are left untouched.

    Args:
        qmd_file: Path of the .qmd file to rewrite. It is read and written
            as UTF-8.
    """
    with open(qmd_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # [\s\S] matches any character including newlines; the lookahead stops
    # at the first following '---' line rather than the last one in the file.
    match = re.search(r'^jupyter:[\s\S]*?(?=^---[ \t]*$)', content, flags=re.MULTILINE)
    if match is None:
        print("Nothing to do: no jupyter yaml found.")
        return

    content = content[:match.start()] + content[match.end():]

    with open(qmd_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print("Success: Jupyter yaml has been removed.")
if __name__ == "__main__":
    # Exactly one argument is expected: the .qmd file to clean.
    if len(sys.argv) != 2:
        print("Usage: python remove_jupyter_yaml.py <qmd_file>")
        sys.exit(1)

    qmd_file = sys.argv[1]
    remove_jupyter_yaml(qmd_file)
55.3.3.1 执行
python remove_jupyter_yaml.py conclusion-huajuhong.qmd
55.3.4 批量转换ipynb文件至qmd后渲染
# render.py
import os
import subprocess
import sys
NOTEBOOK_ENDING = '_notebook.ipynb'


def qmd_name_for(notebook):
    """Return the .qmd file name that belongs to a ``*_notebook.ipynb`` file.

    Quarto deletes an .ipynb with the same base name as the .qmd while
    rendering, so the two files must not share a name; the ``_notebook``
    suffix is what keeps them apart. Only the trailing suffix is removed.

    Args:
        notebook: File name ending in ``_notebook.ipynb``.

    Returns:
        The file name with ``_notebook.ipynb`` replaced by ``.qmd``.

    Raises:
        ValueError: If ``notebook`` does not end in ``_notebook.ipynb``.
    """
    if not notebook.endswith(NOTEBOOK_ENDING):
        raise ValueError("Input file must contain suffix of '_notebook'.")
    return notebook[:-len(NOTEBOOK_ENDING)] + '.qmd'


def check_notebook_name(notebook):
    """Reject an .ipynb file whose name lacks the ``_notebook`` suffix.

    If a sibling ``<name>_notebook.ipynb`` exists, the offending file is
    treated as an intermediate notebook left behind by a failed render and
    is deleted before the error is raised.

    Args:
        notebook: File name ending in ``.ipynb``.

    Raises:
        ValueError: If the name does not end in ``_notebook.ipynb``.
    """
    if notebook.endswith(NOTEBOOK_ENDING):
        return
    sibling = notebook[:-len('.ipynb')] + NOTEBOOK_ENDING
    if os.path.exists(sibling):
        os.remove(notebook)
    raise ValueError("Input file must contain suffix of '_notebook'.")


def convert_notebook(notebook):
    """Convert ``notebook`` into a .qmd file in the parent directory.

    The command is passed as an argument list so file names containing
    spaces or shell metacharacters are handled correctly, and a failing
    conversion raises instead of being ignored.

    Returns:
        The path of the generated .qmd file (``../<name>.qmd``).
    """
    qmd_path = f'../{qmd_name_for(notebook)}'
    subprocess.run(['quarto', 'convert', notebook, '--output', qmd_path], check=True)
    return qmd_path


def main(argv=None):
    """Convert notebooks to .qmd files in the parent directory and render them.

    Args:
        argv: Arguments after the script name; defaults to ``sys.argv[1:]``.
            The first one is either ``all`` (every .ipynb file in the
            current directory) or the name of a single ``*_notebook.ipynb``.

    Returns:
        1 when no argument was given (a usage line is printed), else 0.

    Raises:
        ValueError: If a file name is not an existing ``*_notebook.ipynb``.
        subprocess.CalledProcessError: If a quarto command fails.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("Usage: python render.py <all | xxx_notebook.ipynb>")
        return 1
    notebook = args[0]

    if notebook == "all":
        # Every .ipynb file in the current directory.
        notebooks = sorted(name for name in os.listdir() if name.endswith('.ipynb'))
        for name in notebooks:
            check_notebook_name(name)
            convert_notebook(name)
        # Render the whole project, which lives in the parent directory.
        subprocess.run(['quarto', 'render', '../'], check=True)
        return 0

    # Make sure the given file is an existing .ipynb file.
    if not notebook.endswith('.ipynb'):
        raise ValueError("Input file must be a .ipynb file.")
    if not os.path.exists(notebook):
        raise ValueError("Input file must exist.")
    check_notebook_name(notebook)
    qmd_path = convert_notebook(notebook)
    # Render only the converted file.
    subprocess.run(['quarto', 'render', qmd_path], check=True)
    return 0


if __name__ == "__main__":
    sys.exit(main())
55.3.4.1 执行
cd notebook
python render.py all # 渲染整个项目的所有文件
python render.py xxx_notebook.ipynb # 渲染单个文件