自己写个爬虫实现博客文章搜索功能

正安在线

2024-3-12

编程代码

最终效果

一. 前言

对于自己的CSDN文章，由于只能根据标题关键字搜索，如果文章很多的话，记不清文章标题，这个时候搜索就会比较麻烦，那就自己做一个搜索功能。

二. 思路

把自己的CSDN文章都爬到本地
写个web服务器，负责执行搜索任务
写个静态html页面，负责输入搜索的关键字和展示搜索结果

三. 撸起袖子写代码

目录结构如下

1. python爬虫，爬自己的CSDN文章

python版本：3.8
需要安装库：requests、beautifulsoup4（BeautifulSoup）、lxml（代码中使用了lxml解析器）
csdn_spider.py

# csdn_spider.py
import requests
from bs4 import BeautifulSoup
import base64
import os
import re
import json
from threading import Thread
import time

# Total number of article-list pages to crawl
PAGE_CNT = 16
# Base URL of the paginated article list; the page number is appended to it
BLOG_URL = 'https://blog.csdn.net/linxinfa/article/list/'
# Directory the downloaded article HTML files are written to
DOWNLOADD_DIR = 'articles/'
# Number of list pages whose crawl has finished (incremented by get_all_aritcles)
page_done_cnt = 0
# Running count of articles processed so far
article_cnt = 0
# Maps base64-encoded article title -> article URL; None until initialised or loaded
title2url = None

def get_all_aritcles(page):
    """Crawl one article-list page and download every article linked from it.

    For each article link whose href contains 'linxinfa', the title is
    base64-encoded (with '/' replaced by '%' so it can serve as a file name),
    the article is saved through download_article(), and the encoded title is
    recorded in ``title2url``.

    Args:
        page: Page number appended to BLOG_URL.

    Side effects:
        Updates the module globals ``title2url``, ``article_cnt`` and
        ``page_done_cnt``. ``page_done_cnt`` is incremented in a ``finally``
        clause so that code waiting on it is not left hanging when this page
        fails or contains no article list.

    NOTE: the counter updates are plain read-modify-write statements; they
    are not synchronised between threads.
    """
    global title2url
    global page_done_cnt
    global article_cnt
    try:
        web_url = BLOG_URL + str(page)
        # A timeout keeps a stalled connection from blocking this worker forever.
        r = requests.get(web_url, timeout=30)
        # Strip <br> line-break tags before parsing.
        html_txt = r.text.replace('<br>', '').replace('<br/>', '')

        soup = BeautifulSoup(html_txt, 'lxml')
        tag_main = soup.find('main')
        tag_div = None
        if tag_main is not None:
            tag_div = tag_main.find('div', class_='article-list')
        if tag_div is None:
            # Page layout not as expected (or empty page): nothing to do.
            return

        for tag_h4 in tag_div.find_all('h4'):
            tag_a = tag_h4.find('a')
            if tag_a is None or 'linxinfa' not in tag_a.get('href', ''):
                continue
            url = tag_a['href']
            # tag_a.string cannot be used because the anchor also holds child
            # elements; join only the anchor's own (direct) text nodes instead
            # of extracting children while iterating over them.
            title = ''.join(tag_a.find_all(string=True, recursive=False)).strip()

            print('%s %s: %s' % (article_cnt, title, url))
            encode_title = base64.b64encode(title.encode('utf-8'))
            encode_title = str(encode_title, encoding="utf-8").replace('/', '%')
            try:
                download_article(encode_title, url)
            except requests.RequestException as err:
                # One failed download should not abort the rest of the page.
                print('download failed: %s (%s)' % (url, err))
                continue
            title2url[encode_title] = url
            article_cnt = article_cnt + 1
    finally:
        page_done_cnt = page_done_cnt + 1

def download_article(title, url):
    """Fetch one article page and save its <article> element to disk.

    Args:
        title: Base64-encoded article title, used as the file name stem.
        url: Address of the article page.

    The file is written to DOWNLOADD_DIR + title + '.html'. Nothing is written
    when the page has no ``<article class="baidu_pl">`` element.
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    r = requests.get(url, timeout=30)
    # Keep only the content inside the article tag. re.DOTALL lets '.' match
    # newlines, which captures the same text as '(.|\n)*' without evaluating
    # an alternation for every character.
    match = re.search(r'<article class="baidu_pl">(.*)</article>', r.text, re.DOTALL)
    if not match:
        return
    content_txt = match.group(1)
    # The context manager closes the file even if the write fails.
    with open(DOWNLOADD_DIR + title + '.html', 'w', encoding='utf-8') as fw:
        fw.write('<article class="baidu_pl">' + content_txt + '</article>')

def create_download_dir():
    """Create the DOWNLOADD_DIR directory unless the path already exists."""
    if os.path.exists(DOWNLOADD_DIR):
        return
    os.mkdir(DOWNLOADD_DIR)

def save_title2url_json():
    """Write the ``title2url`` mapping to 'title2url.json' as indented JSON."""
    global title2url
    # Serialise first so a serialisation error cannot leave a truncated file.
    txt = json.dumps(title2url, indent=2)
    # The context manager closes the file even if the write fails.
    with open('title2url.json', 'w', encoding='utf-8') as fw:
        fw.write(txt)

def search_article(search_txt):
    """Return the downloaded articles whose content matches ``search_txt``.

    Args:
        search_txt: Search text. It is used as a regular expression; when it
            is not a valid pattern it is searched for literally instead of
            raising ``re.error``.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, one for each matching
        file under DOWNLOADD_DIR whose decoded file name is a key of
        ``title2url``.
    """
    global title2url
    if title2url is None:
        # Lazily load the mapping on first use (e.g. when imported by the
        # web server rather than run as the crawler).
        with open('title2url.json', 'r') as fr:
            title2url = json.loads(fr.read())

    try:
        pattern = re.compile(search_txt)
    except re.error:
        # Keywords such as "func(" are not valid regular expressions.
        pattern = re.compile(re.escape(search_txt))

    target_articles = []
    for root, dirs, fs in os.walk(DOWNLOADD_DIR):
        for f in fs:
            with open(os.path.join(root, f), 'r', encoding='utf-8') as fr:
                article_txt = fr.read()
            if not pattern.search(article_txt):
                continue
            # Undo the '/' -> '%' substitution applied to the file name.
            k = f.replace('.html', '').replace('%', '/')
            if k in title2url:
                target_articles.append({"title": str(base64.b64decode(k), "utf-8"), "url": title2url[k]})
    return target_articles

if '__main__' == __name__:
    # Crawl every list page concurrently, then persist the title -> URL map.
    title2url = {}
    page_done_cnt = 0
    article_cnt = 0
    create_download_dir()
    workers = []
    for i in range(1, PAGE_CNT + 1):
        # daemon=True replaces the deprecated Thread.setDaemon().
        thread = Thread(target=get_all_aritcles, args=(i,), daemon=True)
        thread.start()
        workers.append(thread)
    # Poll thread liveness rather than the page_done_cnt counter, so that a
    # worker which dies with an exception cannot leave this loop spinning
    # forever.
    while any(worker.is_alive() for worker in workers):
        time.sleep(0.5)

    save_title2url_json()

    print("done")

执行后，把文章爬到本地的articles目录中
注：文件名是文章标题的base64编码（其中把/替换成%，否则会提示非法文件名）

然后还会生成一份标题到链接的映射表

2. web服务器

使用python的tornado框架，负责执行搜索任务
需要安装库：tornado
csdn_articles.py

# csdn_articles.py
import tornado.ioloop
import tornado.web
import json
import os
import re
import platform
import csdn_spider

# TCP port the web server listens on
PORT = 8987

class BaseHandler(tornado.web.RequestHandler):
    """Base request handler that sets CORS headers and placeholder replies."""

    def set_default_headers(self):
        """Set the Access-Control-* response headers."""
        cors_headers = (
            # A specific domain can be written here instead of "*".
            ("Access-Control-Allow-Origin", "*"),
            ("Access-Control-Allow-Headers", "x-requested-with"),
            ('Access-Control-Allow-Methods', 'POST, GET, OPTIONS'),
        )
        for header_name, header_value in cors_headers:
            self.set_header(header_name, header_value)

    def options(self):
        """Answer an OPTIONS request with 204 and no body."""
        self.set_status(204)
        self.finish()

    def get(self):
        """Placeholder GET reply."""
        self.write('some get')

    def post(self):
        """Placeholder POST reply."""
        self.write('some post')

class SearchHandler(BaseHandler):
    """Run an article search for the "txt" field of a JSON request body."""

    def post(self):
        """Reply with the JSON-encoded search results, or 'data error'.

        'data error' is written when the body is not valid JSON, is not a
        JSON object, or has no string "txt" field.
        """
        jd = get_json_from_bytes(self.request.body)
        print('SearchHandler.post', jd)
        # Guard against bodies such as `[]`, `"x"` or `{}` that parse as JSON
        # but would make jd['txt'] raise.
        txt = jd.get('txt') if isinstance(jd, dict) else None
        if not isinstance(txt, str):
            self.write('data error')
            return
        target_articles = csdn_spider.search_article(txt)
        print(target_articles)
        self.write(json.dumps(target_articles))

def get_json_from_bytes(data_bytes):
    """Decode UTF-8 bytes and parse them as JSON.

    Args:
        data_bytes: Raw bytes expected to hold UTF-8 encoded JSON text.

    Returns:
        The parsed JSON value, or None (after printing a diagnostic) when the
        bytes are not valid UTF-8 or not valid JSON.
    """
    try:
        txt = str(data_bytes, encoding="utf8")
    except UnicodeDecodeError:
        print('get_json_from_bytes Error:', data_bytes)
        return None
    # Parse once; a failed parse is the "not JSON" signal.
    try:
        return json.loads(txt)
    except ValueError:
        print('get_json_from_bytes Error:', txt)
        return None

def is_json(txt):
    """Return True when ``txt`` parses as JSON, False when parsing raises ValueError."""
    try:
        json.loads(txt)
        return True
    except ValueError:
        return False

def make_app():
    """Build the tornado Application with the search route registered."""
    routes = [
        (r'/search_article', SearchHandler),
    ]
    return tornado.web.Application(routes)

if __name__ == "__main__":
    # Workaround for a Python 3.8 problem: on Windows, switch asyncio to the
    # selector event loop policy before the server starts.
    on_windows = platform.system() == "Windows"
    if on_windows:
        import asyncio
        selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
        asyncio.set_event_loop_policy(selector_policy)

    server = make_app()
    server.listen(PORT)
    print('start web server successfully, port:', PORT)
    # Run the IO loop; this call does not return while the server is running.
    tornado.ioloop.IOLoop.current().start()

3. 静态网页

负责输入搜索的关键字和展示搜索结果
browser.html

<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>林新发CSDN博客搜索</title>
    <link href="browser.css" type="text/css" rel="stylesheet" />

</head>
<body>
    <h1>林新发CSDN博客搜索</h1>

    <!-- Keyword input; OnSearchBtn() reads it through its id "txt" -->
    请输入关键字 <input id="txt" type="text" name="version" value="">

    <button onclick="OnSearchBtn()">搜索</button>

    <!-- Result table; the search script fills the tbody with one row per hit -->
    <table cellspacing="1" width="100%">
        <thead>
            <tr>
                <th width="10%">index</th>
                <th width="50%">文章</th>
            </tr>
        </thead>
        <tbody id='article_table'>
        </tbody>
    </table>

<!-- The closing body tag was missing from the original markup -->
</body>
</html>

<script>
// Base URL of the search web server; the endpoint name is appended to it
var ADDRESS = 'http://127.0.0.1:8987/';

/**
 * Report whether `str` is a string holding valid JSON.
 *
 * @param {*} str - Value to check.
 * @returns {boolean} true when `str` is a string that JSON.parse accepts;
 *     false otherwise (non-strings are logged and rejected explicitly
 *     instead of falling through and returning undefined).
 */
function isJSON(str) {
    if (typeof str !== 'string') {
        console.log('It is not a string!');
        return false;
    }
    try {
        JSON.parse(str);
        return true;
    } catch (e) {
        return false;
    }
}

/**
 * Click handler of the search button.
 *
 * Posts the keyword from the #txt input to the `search_article` endpoint as
 * JSON and rebuilds the result table body from the JSON array in the reply
 * (one row per article: index and a link with the article title).
 */
function OnSearchBtn() {
    var txt = document.getElementById('txt');
    // Declared with `var` so it does not leak into the global scope.
    var data = { 'txt': txt.value };
    var request_url = ADDRESS + 'search_article';
    var request = new XMLHttpRequest();
    request.open('POST', request_url, true);

    // Attach the handlers before sending the request.
    request.onload = function(e) {
        console.log("请求成功");
        console.log(request.response);
        if (request.status !== 200) {
            console.log('search_article error, request.status: ' + request.status);
            return;
        }
        console.log(request.responseText);

        // Remove the rows of the previous search.
        var tbody = document.querySelector('tbody');
        while (tbody.firstChild) {
            tbody.removeChild(tbody.firstChild);
        }

        if (!isJSON(request.response)) {
            return;
        }
        var jd = JSON.parse(request.response);

        for (var i = 0; i < jd.length; i++) {
            // Create the row and add it to the table body.
            var tr = document.createElement('tr');
            tbody.appendChild(tr);

            // First cell: 1-based result index.
            var indexCell = document.createElement('td');
            indexCell.textContent = i + 1;
            tr.appendChild(indexCell);

            // Second cell: link to the article.
            var linkCell = document.createElement('td');
            var a = document.createElement('a');
            // textContent, not innerHTML: titles come from scraped pages and
            // must not be interpreted as markup.
            a.textContent = jd[i]['title'];
            a.href = jd[i]['url'];
            // '_blank' opens a new tab ('blank' would be a window *named* blank).
            a.target = '_blank';
            a.rel = 'noopener';
            linkCell.appendChild(a);
            tr.appendChild(linkCell);
        }
    };
    request.onerror = function(e) {
        console.log('search_article error, status:' + request.status + ', statusText: ' + request.statusText);
    };

    request.send(JSON.stringify(data));
}

</script>

browser.css

/* Result table: adjacent cell borders are merged into single lines */
table {
border-collapse: collapse;
text-align: center;
}

/* Data and header cells: thin dark border; text-align set here on the cells
   themselves, so cell text is left-aligned */
td,
th {
border: 1px solid #333;
text-align: left;
}

/* Header row: white text on a dark blue background */
thead tr {
background-color: #004B97;
color: white;
}