为了把之前写的博客整理到doc文档中,写了个脚本,把标题和对应链接一起爬出来了,这里做个存档。
> [!NOTE]
> 仅适用于 Typecho 博客引擎。
import requests
import re
# Scrape every post title and its permalink from the paginated index of the
# blog and archive them in mytitle_out.txt as "title / URL" pairs.

# Matches one post anchor on an index page: group 1 is the numeric archive id,
# group 2 is the title text on the line following the opening tag.
regex = r"<a \n    href=\"https://linjoey\.cn/index\.php/archives/(\d{1,3})/\">[\s]*(.*)[\s]*</a>"
post_link_pattern = re.compile(regex)

entries = []          # one "\n<title>\n<url>\n" chunk per post, in page order
fetched_any = False   # becomes True once at least one index page returned 200

try:
    # Index pages are numbered 1..26.
    for index in range(1, 27):
        url = f'https://linjoey.cn/index.php/page/{index}'
        try:
            # Without a timeout a stalled connection would block forever.
            html = requests.get(url, timeout=30)
        except requests.RequestException as err:
            # Network failure: report it and keep what was collected so far.
            print(err)
            break
        if html.status_code != 200:
            # Any non-OK response ends the crawl (e.g. past the last page).
            print(html)
            break
        fetched_any = True
        html_str = html.content.decode()
        for post_id, title in post_link_pattern.findall(html_str):
            entries.append(
                f'\n{title}\nhttps://linjoey.cn/index.php/archives/{post_id}/\n'
            )
finally:
    # Write once, after crawling, instead of rewriting the whole file for
    # every page. The `finally` keeps already-scraped pages even if a later
    # page raises; nothing is written when no page could be fetched at all.
    write_content = ''.join(entries)
    if fetched_any:
        with open('mytitle_out.txt', 'w', encoding='utf-8') as f:
            f.write(write_content)

# Keep the console window open until the user presses Enter.
input("Press Any Key")