Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 80 additions & 13 deletions .github/workflows/run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: arXiv-daily-ai-enhanced

# Workflow triggers: two cron schedules plus manual dispatch.
# GitHub evaluates cron expressions in UTC.
on:
schedule:
# Minute 30 of hour 1, every day.
- cron: "30 1 * * *"
# Minutes 0, 10, 20, 30, 40 and 50 of hour 1, every day (this list also contains minute 30).
- cron: "0,10,20,30,40,50 1 * * *"
# Lets the workflow be started manually.
workflow_dispatch:


Expand All @@ -20,12 +20,27 @@ jobs:
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
uv sync

- name: Schedule guard
  id: schedule_guard
  run: |
    # Decides whether the rest of the job should do any work.
    # Step outputs:
    #   crawl_date - current date in the Asia/Shanghai time zone (YYYY-MM-DD)
    #   should_run - "false" only when a scheduled run finds report/<crawl_date>.md,
    #                "true" in every other case
    today=$(TZ=Asia/Shanghai date "+%Y-%m-%d")
    # Quote the output file path so it is never subject to word splitting.
    echo "crawl_date=$today" >> "$GITHUB_OUTPUT"
    should_run=true

    # Only schedule-triggered runs are skipped; any other event always runs.
    if [ "${{ github.event_name }}" = "schedule" ] && [ -f "report/${today}.md" ]; then
      should_run=false
      echo "今日 report 已存在,跳过重复定时运行 / Today's report exists, skip duplicate scheduled run"
    fi

    echo "should_run=$should_run" >> "$GITHUB_OUTPUT"

- name: Crawl arXiv papers
id: crawl_step
if: steps.schedule_guard.outputs.should_run == 'true'
run: |
source .venv/bin/activate
today=$(date -u "+%Y-%m-%d")
today=${{ steps.schedule_guard.outputs.crawl_date }}
echo "开始爬取 $today 的arXiv论文... / Starting to crawl $today arXiv papers..."

# 检查今日文件是否已存在,如存在则删除 / Check if today's file exists, delete if found
Expand Down Expand Up @@ -60,11 +75,13 @@ jobs:

- name: Check for duplicates
id: dedup_check
if: steps.schedule_guard.outputs.should_run == 'true'
run: |
source .venv/bin/activate
echo "执行去重检查... / Performing intelligent deduplication check..."

cd daily_arxiv
export CRAWL_DATE="${{ steps.crawl_step.outputs.crawl_date }}"
# 执行去重检查脚本 / Execute intelligent deduplication check script
set +e # 暂时允许命令失败 / Temporarily allow command failure
python daily_arxiv/check_stats.py
Expand Down Expand Up @@ -98,7 +115,7 @@ jobs:
esac

- name: AI Enhancement Processing
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
run: |
source .venv/bin/activate
today=${{ steps.crawl_step.outputs.crawl_date }}
Expand All @@ -120,7 +137,7 @@ jobs:
echo "AI增强处理完成 / AI enhancement processing completed"

- name: Convert to Markdown
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
run: |
source .venv/bin/activate
today=${{ steps.crawl_step.outputs.crawl_date }}
Expand Down Expand Up @@ -149,16 +166,65 @@ jobs:
exit 1
fi
echo "Markdown转换完成 / Markdown conversion completed"

- name: Generate keyword daily report
  if: steps.schedule_guard.outputs.should_run == 'true'
  env:
    # Handed to the script through the environment rather than pasted into the
    # script text, so the values are never parsed as shell code.
    LANGUAGE: ${{ vars.LANGUAGE }}
    GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
    NOTEBOOKLM_MODEL: ${{ vars.NOTEBOOKLM_MODEL }}
    NOTEBOOKLM_FALLBACK_MODELS: ${{ vars.NOTEBOOKLM_FALLBACK_MODELS }}
  run: |
    source .venv/bin/activate
    today="${{ steps.crawl_step.outputs.crawl_date }}"
    AI_FILE="data/${today}_AI_enhanced_${LANGUAGE}.jsonl"
    RAW_FILE="data/${today}.jsonl"

    echo "生成关键词日报... / Generating keyword daily report..."
    # Pick the input: AI-enhanced data first, then the raw crawl file,
    # otherwise call the generator without --input.
    # The choice is stored in the positional parameters ("$@").
    if [ -f "$AI_FILE" ]; then
      set -- --input "$AI_FILE"
    elif [ -f "$RAW_FILE" ]; then
      set -- --input "$RAW_FILE"
    else
      echo "未找到数据文件,生成占位日报... / No data file found, generating placeholder report..."
      set --
    fi

    # Single invocation; only the optional --input argument differs per case.
    python report/generate_keyword_report.py "$@" \
      --date "$today" \
      --output-dir report \
      --generate-source-guide \
      --source-guide-dir report/author_source_guides \
      --max-source-papers 30
    echo "关键词日报生成完成 / Keyword daily report generated"

- name: Send daily report email
  if: steps.schedule_guard.outputs.should_run == 'true'
  # Host, user and password are read from repository secrets;
  # the remaining mail settings are read from repository variables.
  env:
    SMTP_HOST: ${{ secrets.SMTP_HOST }}
    SMTP_PORT: ${{ vars.SMTP_PORT }}
    SMTP_USER: ${{ secrets.SMTP_USER }}
    SMTP_PASSWORD: ${{ secrets.SMTP_PASSWORD }}
    SMTP_FROM: ${{ vars.SMTP_FROM }}
    REPORT_EMAIL_TO: ${{ vars.REPORT_EMAIL_TO }}
    SMTP_USE_SSL: ${{ vars.SMTP_USE_SSL }}
  run: |
    source .venv/bin/activate
    report_date=${{ steps.crawl_step.outputs.crawl_date }}

    # Mailing is optional: if any of the three required settings is empty,
    # report it and finish the step successfully without sending.
    for required_value in "$SMTP_HOST" "$SMTP_USER" "$SMTP_PASSWORD"; do
      if [ -z "$required_value" ]; then
        echo "跳过邮件发送:SMTP_HOST/SMTP_USER/SMTP_PASSWORD 未完整配置 / Skip email: SMTP secrets missing"
        exit 0
      fi
    done

    # Send the day's report together with its author source guide.
    python report/send_daily_email.py \
      --date "$report_date" \
      --report-file "report/${report_date}.md" \
      --source-file "report/author_source_guides/${report_date}.md" \
      --default-to "wangzhuoxuan@seu.edu.cn"
    echo "邮件发送完成 / Daily email sent"

# Rewrites assets/file-list.txt with the names of the .jsonl files under data/
# (the leading "data/" is stripped from each listed path).
- name: Update file list
# NOTE(review): `if` appears twice on this step; a YAML mapping may define a key only once — confirm which condition should remain.
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
run: |
echo "更新文件列表... / Updating file list..."
ls data/*.jsonl | sed 's|data/||' > assets/file-list.txt
echo "文件列表更新完成 / File list updated"

- name: Generate password hash and inject into config
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
run: |
echo "🔐 Generating password hash for authentication..."

Expand Down Expand Up @@ -192,6 +258,7 @@ jobs:
echo "🔐 Authentication setup complete"

- name: Summary
if: steps.schedule_guard.outputs.should_run == 'true'
run: |
if [ "${{ steps.dedup_check.outputs.has_new_content }}" = "true" ]; then
echo "✅ 工作流完成:去重发现新内容并成功处理 / Workflow completed: Smart deduplication found new content and processed successfully"
Expand All @@ -213,7 +280,7 @@ jobs:
fi

- name: Inject repository info into data-config.js
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
run: |
echo "注入仓库信息到 data-config.js... / Injecting repository info into data-config.js..."

Expand All @@ -235,7 +302,7 @@ jobs:
fi

- name: Commit code changes to main branch
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true'
run: |
git config --global user.email "${{ vars.EMAIL }}"
git config --global user.name "${{ vars.NAME }}"
Expand All @@ -252,12 +319,12 @@ jobs:
if git diff --staged --quiet; then
echo "没有代码变更需要提交 / No code changes to commit"
else
git commit -m "chore: update data-config.js with repository info"
git commit -m "chore: update workflow config and daily report"
echo "代码变更已提交到 main 分支 / Code changes committed to main branch"
fi

- name: Push code changes to main branch
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true'
run: |
# 设置Git配置以处理自动合并 / Set Git config for automatic merging
git config pull.rebase true
Expand All @@ -280,7 +347,7 @@ jobs:
done

- name: Prepare data files for data branch
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
run: |
echo "准备数据文件... / Preparing data files..."
today=${{ steps.crawl_step.outputs.crawl_date }}
Expand All @@ -296,7 +363,7 @@ jobs:
echo "数据文件已保存到临时目录 / Data files saved to temporary directory"

- name: Setup and commit to data branch
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
run: |
git config --global user.email "${{ vars.EMAIL }}"
git config --global user.name "${{ vars.NAME }}"
Expand Down Expand Up @@ -350,7 +417,7 @@ jobs:
fi

- name: Push data changes to data branch
if: steps.dedup_check.outputs.has_new_content == 'true'
if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
run: |
# 设置Git配置以处理自动合并 / Set Git config for automatic merging
git config pull.rebase true
Expand Down
29 changes: 20 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,18 +44,29 @@ Otherwise, you can directly use this repo in https://dw-dengwei.github.io/daily-
1. Fork this repo to your own account and delete my own information in [buy-me-a-coffee](./buy-me-a-coffee/README.md).
2. Go to: your-own-repo -> Settings -> Secrets and variables -> Actions
3. Go to Secrets. Secrets are encrypted and are used for sensitive data.
4. Create two repository secrets named `OPENAI_API_KEY` and `OPENAI_BASE_URL`, and input corresponding values.
5. [Optional] Set a password in `secrets.ACCESS_PASSWORD` if you do not wish others to access your page. (see https://github.com/dw-dengwei/daily-arXiv-ai-enhanced/pull/64)
6. Go to Variables. Variables are shown as plain text and are used for non-sensitive data
7. Create the following repository variables:
4. Create repository secrets named `OPENAI_API_KEY` and `OPENAI_BASE_URL`, and input corresponding values.
5. [Optional] Create `GOOGLE_API_KEY` for generating Gemini-based source summaries for author-matched papers.
6. [Optional] Configure SMTP secrets to send daily reports by email:
- `SMTP_HOST`
- `SMTP_USER`
- `SMTP_PASSWORD`
7. [Optional] Set a password in `secrets.ACCESS_PASSWORD` if you do not wish others to access your page. (see https://github.com/dw-dengwei/daily-arXiv-ai-enhanced/pull/64)
8. Go to Variables. Variables are shown as plain text and are used for non-sensitive data
9. Create the following repository variables:
1. `CATEGORIES`: separate the categories with ",", such as "cs.CL, cs.CV"
2. `LANGUAGE`: such as "Chinese" or "English"
3. `MODEL_NAME`: such as "deepseek-chat"
4. `EMAIL`: your email for push to GitHub
5. `NAME`: your name for push to GitHub
8. Go to your-own-repo -> Actions -> arXiv-daily-ai-enhanced
9. You can manually click **Run workflow** to test if it works well (it may take about one hour). By default, this action will automatically run every day. You can modify it in `.github/workflows/run.yml`
10. Set up GitHub pages: Go to your own repo -> Settings -> Pages. In `Build and deployment`, set `Source="Deploy from a branch"`, `Branch="main", "/(root)"`. Wait for a few minutes, go to https://\<username\>.github.io/daily-arXiv-ai-enhanced/. Please see this [issue](https://github.com/dw-dengwei/daily-arXiv-ai-enhanced/issues/14) for more precise instructions.
4. `NOTEBOOKLM_MODEL` (optional): preferred Gemini model, such as "gemini-2.0-flash"
5. `NOTEBOOKLM_FALLBACK_MODELS` (optional): comma-separated fallback models, such as "gemini-2.0-flash,gemini-1.5-flash"
6. `SMTP_PORT` (optional): default `465`
7. `SMTP_FROM` (optional): sender email shown in mailbox, default `SMTP_USER`
8. `SMTP_USE_SSL` (optional): `true` or `false`, default `true`
9. `REPORT_EMAIL_TO` (optional): recipient list separated by comma; default sends to `wangzhuoxuan@seu.edu.cn`
10. `EMAIL`: your email for push to GitHub
11. `NAME`: your name for push to GitHub
10. Go to your-own-repo -> Actions -> arXiv-daily-ai-enhanced
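11. You can manually click **Run workflow** to test if it works well (it may take about one hour). By default, this action is scheduled for 09:00 Beijing time (01:00 UTC) every day, with additional triggers every 10 minutes until 09:50; a scheduled run exits early once that day's report already exists. You can modify the schedule in `.github/workflows/run.yml`.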
12. Set up GitHub pages: Go to your own repo -> Settings -> Pages. In `Build and deployment`, set `Source="Deploy from a branch"`, `Branch="main", "/(root)"`. Wait for a few minutes, go to https://\<username\>.github.io/daily-arXiv-ai-enhanced/. Please see this [issue](https://github.com/dw-dengwei/daily-arXiv-ai-enhanced/issues/14) for more precise instructions.

# Plans
See https://github.com/users/dw-dengwei/projects/3
Expand Down
Loading