重构:基础代码
This commit is contained in:
parent
75ee065752
commit
02bbaaa788
44
main.py
44
main.py
@ -1,8 +1,10 @@
|
||||
from flask import Flask, render_template, redirect, url_for
|
||||
from loguru import logger
|
||||
|
||||
from src.config import Config
|
||||
from src.generator import build_data
|
||||
import const
|
||||
import models
|
||||
import tools
|
||||
from generator import Generator
|
||||
|
||||
app = Flask(__name__)
|
||||
logger.add("endofyear.log")
|
||||
@ -10,28 +12,36 @@ logger.add("endofyear.log")
|
||||
|
||||
@app.route('/')
def home():
    """Redirect the site root to the ``painting`` theme page."""
    return redirect(url_for('painting'))
|
||||
|
||||
|
||||
@app.route('/painting')
def painting():
    """Render the ``painting`` theme of the year-end report.

    Collects the site data, the custom data and the blog/post data produced
    by a ``Generator`` for the blog's RSS feed, and hands them to the
    ``painting.html`` template as separate template variables.

    :return: the rendered ``painting.html`` page
    """
    # Site data: service mode and title come from the constants module.
    site = models.Site(
        service=const.SITE_SERVICE,
        title=const.SITE_NAME
    ).to_dict()

    # Custom data: currently only the "yiyan" quote.
    custom = models.Custom(
        yiyan=tools.get_yiyan()
    ).to_dict()

    # Data generator bound to the blog's RSS feed.
    generator = Generator("https://blog.7wate.com/rss.xml")

    # Render the template.
    return render_template('painting.html',
                           site=site,
                           blog=generator.blog(),
                           special_post=generator.special_post(),
                           sentiment_post=generator.sentiment_post(),
                           long_post=generator.long_post(),
                           short_post=generator.short_post(),
                           custom=custom
                           )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
145
src/analyzer.py
145
src/analyzer.py
@ -1,5 +1,3 @@
|
||||
from typing import Any
|
||||
|
||||
import jieba.analyse
|
||||
import pytz
|
||||
from dateutil.parser import parse
|
||||
@ -7,31 +5,52 @@ from loguru import logger
|
||||
from lunardate import LunarDate
|
||||
from snownlp import SnowNLP
|
||||
|
||||
import const
|
||||
|
||||
|
||||
# 计算文本内容情感分数
|
||||
def analyze_sentiment(keys):
    """Compute a sentiment score for a post from its keywords.

    Every keyword is scored with ``SnowNLP(key).sentiments``; the scores are
    averaged and the average is scaled by a factor that depends on how many
    keywords were scored:

    - more than 10: the highest and lowest scores are dropped, factor 1000
    - 7 to 10: factor 900
    - 4 to 6: factor 800
    - 1 to 3: factor 500
    - none: the result is 0

    :param keys: sequence of keyword strings; ``None`` and entries that are
        not non-empty strings are ignored
    :return: the scaled score, truncated to an integer
    """
    if not keys:
        return 0

    # Ignore anything that is not a usable keyword, e.g. the ``(None, [])``
    # value that the keyword extraction returns on failure.
    scores = [SnowNLP(key).sentiments
              for key in keys
              if isinstance(key, str) and key]
    count = len(scores)
    if count == 0:
        return 0

    total = sum(scores)

    if count > 10:
        # Trimmed mean: discard one maximum and one minimum.
        trimmed = (total - max(scores) - min(scores)) / (count - 2)
        return int(trimmed * 1000)

    # The ranges are contiguous so that counts of exactly 10, 6 and 3 are
    # scored instead of falling through to 0.
    if count > 6:
        factor = 900
    elif count > 3:
        factor = 800
    else:
        factor = 500

    average = total / count
    return int(average * factor)
|
||||
|
||||
|
||||
def classify_and_extract_keywords(text: str, topK: int, stopwords: str,
|
||||
tech_terms_file: str) -> tuple[None, list[Any]] | tuple[int, Any]:
|
||||
def extract_keywords(text,
|
||||
topK,
|
||||
stopwords):
|
||||
"""
|
||||
博客文章关键字提取
|
||||
文章关键字提取
|
||||
:param text:文章文本
|
||||
:param topK:关键字数量,建议20个
|
||||
:param stopwords:停词文本,去掉无意义词组
|
||||
:param tech_terms_file:专业词语,区分文章类目
|
||||
:param topK:关键字数量
|
||||
:param stopwords:停词文本(去掉无意义词组)
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
jieba.analyse.set_stop_words(stopwords)
|
||||
keywords = jieba.analyse.extract_tags(text, topK=topK)
|
||||
return keywords
|
||||
except ValueError as e:
|
||||
logger.error(f"关键词提取出错:{e}")
|
||||
return None, []
|
||||
@ -39,72 +58,52 @@ def classify_and_extract_keywords(text: str, topK: int, stopwords: str,
|
||||
logger.error(f"关键词提取出错:{e}")
|
||||
return None, []
|
||||
|
||||
|
||||
def check_category(tech_terms_file, keywords):
    """Decide whether a post is a technical or a life post.

    :param tech_terms_file: path of a UTF-8 text file holding one dictionary
        term per line
    :param keywords: keywords of the post; ``None`` is treated as empty and
        entries that are not strings are ignored
    :return: ``const.BLOG_POST_CATEGORY_TECH`` if any keyword matches a
        dictionary term case-insensitively, otherwise
        ``const.BLOG_POST_CATEGORY_LIFE``
    """
    # Load the dictionary as a lower-cased set; blank lines are skipped so the
    # empty string never counts as a term.
    with open(tech_terms_file, 'r', encoding='utf-8') as f:
        tech_terms_set = {term for line in f if (term := line.strip().lower())}

    for keyword in keywords or ():
        if isinstance(keyword, str) and keyword.lower() in tech_terms_set:
            return const.BLOG_POST_CATEGORY_TECH

    return const.BLOG_POST_CATEGORY_LIFE
|
||||
|
||||
|
||||
def calculate_weight(time_str: str):
|
||||
def calculate_weight(time_str: str) -> int:
|
||||
"""
|
||||
博客文章特殊日期权重分数计算。
|
||||
- 传统节假日 +10
|
||||
- 节假日 +7
|
||||
- 凌晨 +5
|
||||
- 早上 +4
|
||||
- 下午 +3
|
||||
- 晚上 +2
|
||||
计算文章特殊日期的权重分数。
|
||||
- 传统节假日 +10
|
||||
- 节假日 +7
|
||||
- 凌晨 +5
|
||||
- 早上 +4
|
||||
- 下午 +3
|
||||
- 晚上 +2
|
||||
|
||||
:param time_str: 时间字符串
|
||||
:return:总分数,特殊日期
|
||||
:return: 总分数(整数)
|
||||
"""
|
||||
dt = parse(time_str)
|
||||
dt = dt.astimezone(pytz.timezone('Asia/Shanghai'))
|
||||
dt = dt.astimezone(pytz.timezone(const.TIME_ZONE))
|
||||
|
||||
weight = 0
|
||||
date_str = ""
|
||||
|
||||
# 农历节日权重计算
|
||||
LUNAR_HOLIDAYS = {
|
||||
(1, 1): '春节',
|
||||
(1, 15): '元宵节',
|
||||
(2, 2): '龙抬头',
|
||||
(5, 5): '端午节',
|
||||
(7, 7): '七夕节',
|
||||
(7, 15): '中元节',
|
||||
(8, 15): '中秋节',
|
||||
(9, 9): '重阳节',
|
||||
(12, 8): '腊八节',
|
||||
(12, 23): '小年',
|
||||
(12, 30): '除夕'
|
||||
}
|
||||
|
||||
# 计算农历节假日的权重
|
||||
lunar_date = LunarDate.fromSolarDate(dt.year, dt.month, dt.day)
|
||||
if (lunar_date.month, lunar_date.day) in LUNAR_HOLIDAYS:
|
||||
if (lunar_date.month, lunar_date.day) in const.LUNAR_HOLIDAYS:
|
||||
weight += 10
|
||||
date_str = LUNAR_HOLIDAYS[(lunar_date.month, lunar_date.day)]
|
||||
|
||||
# 公历节日权重计算
|
||||
SOLAR_HOLIDAYS = {
|
||||
(1, 1): '元旦',
|
||||
(2, 14): '情人节',
|
||||
(3, 8): '国际妇女节',
|
||||
(4, 4): '清明节',
|
||||
(5, 1): '国际劳动节',
|
||||
(10, 1): '国庆节',
|
||||
(12, 13): '南京大屠杀纪念日',
|
||||
(9, 18): '九一八事变纪念日',
|
||||
(12, 7): '南京保卫战胜利纪念日',
|
||||
(8, 15): '抗日战争胜利纪念日'
|
||||
}
|
||||
|
||||
if (dt.month, dt.day) in SOLAR_HOLIDAYS:
|
||||
# 计算公历节假日的权重
|
||||
if (dt.month, dt.day) in const.SOLAR_HOLIDAYS:
|
||||
weight += 7
|
||||
date_str = SOLAR_HOLIDAYS[(dt.month, dt.day)]
|
||||
|
||||
# 计算时间节点的权重
|
||||
if 22 <= dt.hour or dt.hour < 7:
|
||||
weight += 5
|
||||
elif 7 <= dt.hour < 12:
|
||||
@ -116,7 +115,25 @@ def calculate_weight(time_str: str):
|
||||
else:
|
||||
weight += 0
|
||||
|
||||
if not date_str:
|
||||
date_str = f"{dt.month}月{dt.day}日"
|
||||
return weight
|
||||
|
||||
return weight, date_str
|
||||
|
||||
def special_date_calculation(time_str):
    """Return the display label for the day a post was published.

    The timestamp is parsed and converted to ``const.TIME_ZONE``. A lunar
    holiday takes precedence over a solar holiday; when the day is neither,
    the label is the plain date.

    :param time_str: time string
    :return: the holiday name from ``const.LUNAR_HOLIDAYS`` or
        ``const.SOLAR_HOLIDAYS``, or ``"{month}月{day}日"``
    """
    dt = parse(time_str).astimezone(pytz.timezone(const.TIME_ZONE))

    # Lunar holidays are looked up by (lunar month, lunar day).
    lunar_date = LunarDate.fromSolarDate(dt.year, dt.month, dt.day)
    lunar_key = (lunar_date.month, lunar_date.day)
    if lunar_key in const.LUNAR_HOLIDAYS:
        return const.LUNAR_HOLIDAYS[lunar_key]

    # Solar holidays are looked up by (month, day).
    solar_key = (dt.month, dt.day)
    if solar_key in const.SOLAR_HOLIDAYS:
        return const.SOLAR_HOLIDAYS[solar_key]

    return f"{dt.month}月{dt.day}日"
|
||||
|
49
src/const.py
Normal file
49
src/const.py
Normal file
@ -0,0 +1,49 @@
|
||||
# Time zone that post timestamps are converted to.
TIME_ZONE = "Asia/Shanghai"

# strftime format used for formatted timestamps.
FORMAT_TIME = "%Y-%m-%d %H:%M:%S"

# Site service mode.
SITE_SERVICE = 1

# Site title.
SITE_NAME = "EndOfYear"

# Blog post category: life.
BLOG_POST_CATEGORY_LIFE = 1

# Blog post category: technology.
BLOG_POST_CATEGORY_TECH = 2

# Number of top keywords kept for the blog.
BLOG_MAX_KEYS = 7

# Lunar holidays, keyed by (lunar month, lunar day).
# NOTE(review): '除夕' is keyed on day 30; a twelfth lunar month that has only
# 29 days would presumably never match - confirm.
LUNAR_HOLIDAYS = {
    (1, 1): '春节',
    (1, 15): '元宵节',
    (2, 2): '龙抬头',
    (5, 5): '端午节',
    (7, 7): '七夕节',
    (7, 15): '中元节',
    (8, 15): '中秋节',
    (9, 9): '重阳节',
    (12, 8): '腊八节',
    (12, 23): '小年',
    (12, 30): '除夕'
}

# Solar (Gregorian) holidays, keyed by (month, day).
SOLAR_HOLIDAYS = {
    (1, 1): '元旦',
    (2, 14): '情人节',
    (3, 8): '妇女节',
    (4, 4): '清明节',
    (5, 1): '劳动节',
    (10, 1): '国庆节',
    (12, 13): '南京大屠杀纪念日',
    (9, 18): '九一八事变纪念日',
    (12, 7): '南京保卫战胜利纪念日',
    (8, 15): '抗日战争胜利纪念日'
}
|
199
src/generator.py
199
src/generator.py
@ -1,97 +1,116 @@
|
||||
from collections import Counter
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from .analyzer import analyze_sentiment, calculate_weight, classify_and_extract_keywords
|
||||
from .config import Config
|
||||
from .scraper import Blog
|
||||
from .tools import get_yiyan
|
||||
import models
|
||||
import scraper
|
||||
|
||||
|
||||
class Generator:
    """Builds the template data of the year-end report from a blog's RSS feed."""

    def __init__(self, rss):
        """
        Create the underlying ``scraper.Blog`` and log it together with its posts.

        :param rss: RSS link
        :raises Exception: whatever ``scraper.Blog`` raised, when the blog
            object could not be created
        """
        try:
            self._my_blog = scraper.Blog(rss)
            logger.debug(self._my_blog)
            for i, post in enumerate(self._my_blog.post_lists, 1):
                logger.info(f"Post #{i}:")
                logger.info(post)
        except Exception as e:
            logger.error(f"Generator 无法创建 Blog 对象: {str(e)}")
            # Without a blog object every other method would only fail later
            # with an unrelated AttributeError, so surface the real cause.
            # A failure in the logging above stays non-fatal.
            if not hasattr(self, "_my_blog"):
                raise

    def blog(self):
        """
        Return the blog-level data.

        :return: ``models.Blog`` as a dict
        """
        return models.Blog(
            name=self._my_blog.title,
            link=self._my_blog.link,
            life=self._my_blog.life,
            article_count=self._my_blog.article_count,
            article_word_count=self._my_blog.article_word_count,
            top_post_keys=self._my_blog.keys,
            category=self._my_blog.category
        ).to_dict()

    def special_post(self):
        """
        Return the post with the highest special-date score.

        :return: ``models.Post`` as a dict
        """
        return self._post_to_dict(self._get_post_with_max("special_date_score"))

    def sentiment_post(self):
        """
        Return the post with the highest sentiment score.

        :return: ``models.Post`` as a dict
        """
        return self._post_to_dict(self._get_post_with_max("sentiment_score"))

    def long_post(self):
        """
        Return the longest post (largest ``word_count``).

        :return: ``models.Post`` as a dict
        """
        return self._post_to_dict(self._get_post_with_max("word_count"))

    def short_post(self):
        """
        Return the shortest post (smallest ``word_count``).

        :return: ``models.Post`` as a dict
        """
        return self._post_to_dict(self._get_post_with_min("word_count"))

    @staticmethod
    def _post_to_dict(post):
        """
        Convert a scraped post into a ``models.Post`` dict.

        :param post: scraped post, or ``None`` when the feed has no posts
        :return: ``models.Post`` as a dict; all fields are empty for ``None``
        """
        if post is None:
            return models.Post(title="", content="", keys=[], time="", date="").to_dict()
        return models.Post(
            title=post.title,
            content=post.content,
            keys=post.keys,
            time=post.time,
            date=post.date
        ).to_dict()

    def _get_post_with_max(self, score_attr):
        """
        Return the post whose attribute has the largest value.

        :param score_attr: name of the attribute to compare
        :return: the first post holding the maximum, or ``None`` when there
            are no posts
        """
        return max(self._my_blog.post_lists,
                   key=lambda post: getattr(post, score_attr),
                   default=None)

    def _get_post_with_min(self, score_attr):
        """
        Return the post whose attribute has the smallest value.

        :param score_attr: name of the attribute to compare
        :return: the first post holding the minimum, or ``None`` when there
            are no posts
        """
        return min(self._my_blog.post_lists,
                   key=lambda post: getattr(post, score_attr),
                   default=None)
|
||||
|
80
src/models.py
Normal file
80
src/models.py
Normal file
@ -0,0 +1,80 @@
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import List
|
||||
|
||||
@dataclass
class Site:
    """
    Site data model.

    - service: service mode
    - title: site title
    """
    service: int
    title: str

    def to_dict(self) -> dict:
        """
        Return the fields as a new dict; Enum members are replaced by their ``value``.
        """
        return {k: v.value if isinstance(v, Enum) else v for k, v in vars(self).items()}
|
||||
|
||||
@dataclass
class Blog:
    """
    Blog data model.

    - name: blog name
    - link: blog link
    - life: number of days since the domain was registered
    - article_count: total number of posts
    - article_word_count: total word count over all posts
    - top_post_keys: blog keywords
    - category: blog category
    """
    name: str
    link: str
    life: int
    article_count: int
    article_word_count: int
    # NOTE(review): the painting template reads ``keyword[0]`` from each entry,
    # so the entries look like (keyword, count) pairs rather than plain
    # strings - confirm against the producer.
    top_post_keys: List[str]
    category: int

    def to_dict(self) -> dict:
        """
        Return the fields as a new dict; Enum members are replaced by their ``value``.
        """
        return {k: v.value if isinstance(v, Enum) else v for k, v in vars(self).items()}
|
||||
|
||||
@dataclass
class Post:
    """
    Post data model.

    - title: title
    - content: content
    - keys: list of keywords
    - time: time string
    - date: date string
    """
    title: str
    content: str
    keys: List[str]
    time: str
    date: str

    def to_dict(self) -> dict:
        """
        Return the fields as a new dict; Enum members are replaced by their ``value``.
        """
        return {k: v.value if isinstance(v, Enum) else v for k, v in vars(self).items()}
|
||||
|
||||
@dataclass
class Custom:
    """
    Custom data model.

    - yiyan: the "yiyan" quote
    """
    yiyan: str

    def to_dict(self) -> dict:
        """
        Return the fields as a new dict.

        A copy is returned rather than the instance ``__dict__`` itself, so
        changing the result does not change the instance.
        """
        return dict(vars(self))
|
221
src/scraper.py
221
src/scraper.py
@ -1,85 +1,139 @@
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
import feedparser
|
||||
from loguru import logger
|
||||
|
||||
from . import tools
|
||||
import analyzer
|
||||
import const
|
||||
import tools
|
||||
|
||||
|
||||
class Blog:
    """A blog read from an RSS feed, with aggregate data over its posts."""

    def __init__(self, rss):
        """
        Parse the feed and wrap every entry in a ``Post``.

        :param rss: RSS feed source passed to ``feedparser.parse``
        :raises Exception: re-raised after logging when parsing the feed or
            building a post fails
        """
        try:
            # Parse the RSS feed.
            self._feed = feedparser.parse(rss)
            # Build a Post for every entry of the feed.
            self._posts = [Post(entry) for entry in self._feed.entries]
        except Exception as e:
            logger.error(f'Feedparser 解析 RSS feed 时发生错误: {str(e)}')
            raise

    def _get_feed_field(self, field):
        """
        Read a field of the RSS feed.

        :param field: field name
        :return: the field value, or ``""`` (after logging a warning) when the
            field is missing or empty
        """
        if field_value := self._feed.feed.get(field):
            return field_value
        logger.warning(f'Feedparser {field} 字段不存在!')
        return ""

    @property
    def title(self):
        # Title of the RSS feed.
        return self._feed.feed.get('title')

    @property
    def link(self):
        # Link of the RSS feed.
        return self._feed.feed.get('link')

    @property
    def life(self):
        # Domain lifetime for the feed link, as reported by tools.get_domain_life.
        return tools.get_domain_life(self.link)

    @property
    def article_count(self):
        # Number of posts.
        return len(self._posts) if self._posts else 0

    @property
    def article_word_count(self):
        # Total word count over all posts.
        return sum(post.word_count for post in self._posts) if self._posts else 0

    @property
    def keys(self):
        # Most frequent Chinese keywords over all posts, as (keyword, count)
        # pairs; at most const.BLOG_MAX_KEYS entries.
        if self._posts:
            # Collect the keywords of every post.
            all_keys = [key for post in self._posts for key in post.keys]

            # Keep only keywords that contain Chinese characters.
            chinese_keys = [key for key in all_keys if re.search(r'[\u4e00-\u9fff]+', key)]

            # Count the occurrences and take the most common ones.
            keyword_counts = Counter(chinese_keys)
            top_keywords = keyword_counts.most_common(const.BLOG_MAX_KEYS)

            return top_keywords

        return []

    @property
    def category(self):
        # Category of the blog: the most common category among its posts.
        if self._posts:
            categories = [post.category for post in self._posts]
            cat_counts = Counter(categories)
            most_common_cat = cat_counts.most_common(1)[0][0]
            return most_common_cat
        # A blog without posts falls back to the life category.
        return const.BLOG_POST_CATEGORY_LIFE

    @property
    def post_lists(self):
        # List of posts.
        return self._posts if self._posts else []

    def __str__(self):
        # NOTE(review): the leading whitespace of the lines inside this
        # literal is not recoverable from the diff view - confirm against the
        # repository.
        return f"""
博客: {self.title}
链接: {self.link}
时间: {self.life} 天
文章: {self.article_count} 篇
字数: {self.article_word_count} 个
分类: {self.category}
关键字: {self.keys}
"""
|
||||
|
||||
|
||||
class Post:
|
||||
def __init__(self, entry):
|
||||
# 日期权重
|
||||
self._weight = None
|
||||
# 日子
|
||||
self._date = None
|
||||
# 情感分
|
||||
self._score = None
|
||||
# 关键字
|
||||
self._keys = None
|
||||
# 分类
|
||||
self._category = None
|
||||
self.entry = entry
|
||||
# 文章内容
|
||||
self._content = self._get_content()
|
||||
# 文章时间
|
||||
self._time = tools.format_datetime(self._get_entry_field('published'))
|
||||
# 文章日期
|
||||
self._date = analyzer.special_date_calculation(self._time)
|
||||
# 特殊日期分
|
||||
self._special_date_score = analyzer.calculate_weight(self._get_entry_field('published'))
|
||||
# 关键字
|
||||
self._keys = analyzer.extract_keywords(text=self._content,
|
||||
topK=tools.get_multiple_of_100(self._content),
|
||||
stopwords='data/stop_words.txt')
|
||||
# 文章情感分
|
||||
self._sentiment_score = analyzer.analyze_sentiment(self._keys)
|
||||
# 分类
|
||||
self._category = analyzer.check_category(tech_terms_file='data/tech_terms.txt', keywords=self._keys)
|
||||
|
||||
def _get_entry_field(self, field):
|
||||
"""
|
||||
从 RSS entry 中获取特定字段
|
||||
"""
|
||||
field_value = self.entry.get(field)
|
||||
if field_value is None:
|
||||
pass
|
||||
# logger.warning(f'{field} 字段不存在!')
|
||||
return field_value
|
||||
return self.entry.get(field)
|
||||
|
||||
@property
|
||||
def title(self):
|
||||
return self._get_entry_field('title')
|
||||
|
||||
@property
|
||||
def content(self):
|
||||
def _get_content(self):
|
||||
"""
|
||||
获取文章内容。
|
||||
:return: 文章的描述或内容,根据以下规则:
|
||||
- 如果'content'字段存在,那么返回'content'字段的值。
|
||||
- 如果'description'字段的长度小于128,并且'content'字段存在,那么返回'content'字段的值。
|
||||
- 否则,返回'description'字段的值。
|
||||
- 如果'description'和'content'字段都不存在,返回空字符串。
|
||||
"""
|
||||
description = self._get_entry_field('description')
|
||||
content = self._get_entry_field('content')
|
||||
if content:
|
||||
@ -94,60 +148,61 @@ class Post:
|
||||
return description
|
||||
|
||||
@property
|
||||
def time(self):
|
||||
return self._get_entry_field('published')
|
||||
def title(self):
|
||||
# 获取文章标题
|
||||
return self._get_entry_field('title')
|
||||
|
||||
@property
|
||||
def link(self):
|
||||
return self._get_entry_field('link')
|
||||
def content(self):
|
||||
# 获取文章内容
|
||||
return self._content
|
||||
|
||||
@property
|
||||
def word_count(self):
|
||||
# 获取文章字数
|
||||
return len(self.content) if self.content else 0
|
||||
|
||||
@property
|
||||
def keys(self):
|
||||
return self._keys
|
||||
|
||||
@keys.setter
|
||||
def keys(self, value):
|
||||
self._keys = value
|
||||
|
||||
@property
|
||||
def score(self):
|
||||
return self._score
|
||||
|
||||
@score.setter
|
||||
def score(self, value):
|
||||
self._score = value
|
||||
|
||||
@property
|
||||
def category(self):
|
||||
return self._category
|
||||
|
||||
@category.setter
|
||||
def category(self, value):
|
||||
self._category = value
|
||||
def time(self):
|
||||
# 获取文章时间
|
||||
return self._time
|
||||
|
||||
@property
|
||||
def date(self):
|
||||
# 获取日期分
|
||||
return self._date
|
||||
|
||||
@date.setter
|
||||
def date(self, value):
|
||||
self._date = value
|
||||
@property
|
||||
def link(self):
|
||||
# 获取文章链接
|
||||
return self._get_entry_field('link')
|
||||
|
||||
@property
|
||||
def weight(self):
|
||||
return self._weight
|
||||
def keys(self):
|
||||
# 获取文章关键字
|
||||
return self._keys
|
||||
|
||||
@weight.setter
|
||||
def weight(self, value):
|
||||
self._weight = value
|
||||
@property
|
||||
def category(self):
|
||||
# 获取文章分类
|
||||
return self._category
|
||||
|
||||
@property
|
||||
def special_date_score(self):
|
||||
# 获取特殊日期分
|
||||
return self._special_date_score
|
||||
|
||||
@property
|
||||
def sentiment_score(self):
|
||||
# 获取文章情感分
|
||||
return self._sentiment_score
|
||||
|
||||
def __str__(self):
|
||||
return (f"Post title={self.title[:20]}..., "
|
||||
f" content={self.content[:20]}..., "
|
||||
f" time={self.time}, "
|
||||
f" link={self.link}, "
|
||||
f" word_count={self.word_count}")
|
||||
return (f" 标题:{self.title}, "
|
||||
f" 内容:{self.content[:20]}..., "
|
||||
f" 时间:{self.time}, "
|
||||
f" 链接:{self.link}, "
|
||||
f" 日期分:{self.special_date_score}"
|
||||
f" 情感分:{self.sentiment_score}"
|
||||
f" 类目:{self.category}"
|
||||
f" 关键字:{self.keys}")
|
||||
|
37
src/tools.py
37
src/tools.py
@ -1,10 +1,14 @@
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import pytz
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from dateutil.parser import parse
|
||||
from loguru import logger
|
||||
|
||||
import const
|
||||
|
||||
|
||||
def check_website_status(url):
|
||||
"""
|
||||
@ -54,10 +58,10 @@ def get_domain_life(url):
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
|
||||
}
|
||||
domain_url = f"https://rdap.verisign.com/com/v1/domain/{url}"
|
||||
domain = get_domain(url)
|
||||
|
||||
try:
|
||||
response = requests.get(domain_url, headers=headers, timeout=30)
|
||||
response = requests.get(f"https://rdap.verisign.com/com/v1/domain/{domain}", headers=headers, timeout=30)
|
||||
response.raise_for_status() # Raises stored HTTPError, if one occurred.
|
||||
|
||||
registration_date = response.json().get('events')[0].get('eventDate')
|
||||
@ -87,7 +91,7 @@ def get_domain_life(url):
|
||||
except Exception as err:
|
||||
logger.error(f"未预期的错误: {err}")
|
||||
|
||||
return None
|
||||
return 0
|
||||
|
||||
|
||||
def remove_html_tags(text):
|
||||
@ -105,7 +109,8 @@ def get_yiyan():
|
||||
:return:一言
|
||||
"""
|
||||
try:
|
||||
response = requests.get("https://v1.hitokoto.cn/?c=d&min_length=12&encode=text", timeout=30) # Set timeout to 5 seconds
|
||||
response = requests.get("https://v1.hitokoto.cn/?c=d&min_length=18&max_length=24&encode=text",
|
||||
timeout=30) # Set timeout to 5 seconds
|
||||
if response.status_code == 200:
|
||||
return response.text
|
||||
else:
|
||||
@ -123,3 +128,27 @@ def get_yiyan():
|
||||
except Exception as e:
|
||||
logger.error(f"一言未知错误,错误:{e}")
|
||||
return False
|
||||
|
||||
|
||||
def get_multiple_of_100(string):
    """
    Return how many whole hundreds fit into the length of the text.

    :param string: the text to measure
    :return: ``len(string) // 100``, but never less than 1; used as the
        suggested number of keywords
    """
    return max(1, len(string) // 100)
|
||||
|
||||
|
||||
def format_datetime(dt_str):
    """
    Format a time string in the configured time zone and format.

    :param dt_str: time string
    :return: the time converted to ``const.TIME_ZONE`` and rendered with
        ``const.FORMAT_TIME``
    """
    tz = pytz.timezone(const.TIME_ZONE)
    return parse(dt_str).astimezone(tz).strftime(const.FORMAT_TIME)
|
||||
|
@ -3,8 +3,8 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>EndOfYear</title>
|
||||
{% if web_status == 1 %}
|
||||
<title>{{ site.title }}</title>
|
||||
{% if site.service == 1 %}
|
||||
<link rel="stylesheet" href="{{ url_for('static', filename='painting/css/normalize.css') }}">
|
||||
<link rel="stylesheet" href="{{ url_for('static', filename='painting/css/animate.min.css') }}">
|
||||
<link rel="stylesheet" href="{{ url_for('static', filename='painting/css/painting.css') }}">
|
||||
@ -19,7 +19,7 @@
|
||||
<body>
|
||||
<div class="container active animate__animated animate__fadeIn animate__slow" id="tab1">
|
||||
<audio id="bgm" loop>
|
||||
{% if web_status == 1 %}
|
||||
{% if site.service == 1 %}
|
||||
<source src="{{ url_for('static', filename='painting/music/bgm.mp3') }}" type="audio/mpeg">Your browser does
|
||||
not support the audio element.
|
||||
{% else %}
|
||||
@ -31,7 +31,7 @@
|
||||
<div class="notice">
|
||||
<h4>温馨提示</h4>
|
||||
<hr>
|
||||
<p>EndofYear 使用互联网上公开的 RSS 数据源,并使用自建的 Umami 服务统计访问量,绝对不会主动获取个人隐私信息。🫣🫣🫣
|
||||
<p>EndofYear 使用互联网公开的 RSS 数据源,并使用自建的 Umami 服务统计访问量,绝对不会主动获取个人隐私信息。🫣🫣🫣
|
||||
<br>
|
||||
<br>
|
||||
开启方式:小手轻轻点 ~
|
||||
@ -47,14 +47,14 @@
|
||||
</div>
|
||||
<div class="container animate__animated animate__fadeIn animate__slow" id="tab2">
|
||||
<div class="tab2-box">
|
||||
<p class="animate__animated animate__fadeIn animate__delay-1s">亲爱的{{ data.blog_name }}</p>
|
||||
{% if data.blog_life == 0 %}
|
||||
<p class="animate__animated animate__fadeIn animate__delay-1s">亲爱的{{ blog.name }}</p>
|
||||
{% if blog.life == 0 %}
|
||||
<p class="animate__animated animate__fadeIn animate__delay-2s">旧事如梦,一年已过</p>
|
||||
<p class="animate__animated animate__fadeIn animate__delay-2s">贰三年、感谢有你!</p>
|
||||
{% else %}
|
||||
<p class="animate__animated animate__fadeIn animate__delay-2s">今天是我们相识的</p>
|
||||
<p class="animate__animated animate__fadeIn animate__delay-3s">第 <small>{{ data.blog_life_year }}</small> 年
|
||||
<small>{{ data.blog_life_day }}</small> 天</p>
|
||||
<p class="animate__animated animate__fadeIn animate__delay-3s">第 <small>{{ blog.life // 365 }}</small> 年
|
||||
<small>{{ blog.life % 365 }}</small> 天</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
@ -62,23 +62,23 @@
|
||||
<div class="container animate__animated animate__fadeIn animate__slow" id="tab3">
|
||||
<div class="tab3-box">
|
||||
<p class="animate__animated animate__fadeInUp animate__delay-1s">这一年你写下了</p>
|
||||
<p class="animate__animated animate__fadeInUp animate__delay-2s"><small>{{ data.blog_article_count }}</small>
|
||||
<p class="animate__animated animate__fadeInUp animate__delay-2s"><small>{{ blog.article_count }}</small>
|
||||
篇博文</p>
|
||||
<p class="animate__animated animate__fadeInUp animate__delay-3s">
|
||||
<small>{{ data.blog_article_word_count }}</small> 个文字</p>
|
||||
<small>{{ blog.article_word_count }}</small> 个文字</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container animate__animated animate__fadeIn animate__slow" id="tab4">
|
||||
<div class="tab4-box">
|
||||
<p class="animate__animated animate__fadeInDown animate__delay-1s">{{ data.blog_content_date }}那天,你写下了</p>
|
||||
<p class="animate__animated animate__fadeInDown animate__delay-2s">{{ data.blog_title }}</p>
|
||||
<p class="animate__animated animate__fadeInDown animate__delay-3s">{{ data.blog_content }}<small>……</small>
|
||||
<p class="animate__animated animate__fadeInDown animate__delay-1s">{{ special_post.date }}那天,你写下了</p>
|
||||
<p class="animate__animated animate__fadeInDown animate__delay-2s">{{ special_post.title }}</p>
|
||||
<p class="animate__animated animate__fadeInDown animate__delay-3s">{{ special_post.content[:50] }}<small>……</small>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container animate__animated animate__fadeIn animate__slow" id="tab5">
|
||||
<div class="tab5-box">
|
||||
{% for keyword in data.blog_top_keywords %}
|
||||
{% for keyword in blog.top_post_keys[0:5] %}
|
||||
<p>{{ keyword[0] }}</p>
|
||||
{% endfor %}
|
||||
<p class="animate__animated animate__fadeInDown animate__delay-1s">这些都是<small>你</small>的</p>
|
||||
@ -87,13 +87,15 @@
|
||||
</div>
|
||||
<div class="container animate__animated animate__fadeIn animate__slow" id="tab6">
|
||||
<div class="tab6-box">
|
||||
<p class="animate__animated animate__fadeInLeft animate__delay-1s">热爱{{ data.blog_category }}的你</p>
|
||||
<p class="animate__animated animate__fadeInLeft animate__delay-1s">
|
||||
热爱{% if blog.category == 1 %}生活{% else %}技术{% endif %}的你
|
||||
</p>
|
||||
<p class="animate__animated animate__fadeInLeft animate__delay-2s">一定要继续砥砺前行!</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="container animate__animated animate__fadeIn animate__slow" id="tab7">
|
||||
<div class="tab7-box">
|
||||
<p id="yiyan"> {{ data.blog_end_yiyan }}</p>
|
||||
<p id="yiyan"> {{ custom.yiyan }}</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user