dw-dengwei · WangZX-SEU · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml
@@ -5,7 +5,7 @@ name: arXiv-daily-ai-enhanced
 
 on:
   schedule:
-    - cron: "30 1 * * *"
+    - cron: "*/10 1 * * *"
   workflow_dispatch:
 
 
@@ -20,12 +20,27 @@ jobs:
       run: |
         curl -LsSf https://astral.sh/uv/install.sh | sh
         uv sync
+
+    - name: Schedule guard
+      id: schedule_guard
+      run: |
+        # Resolve today's date in the Asia/Shanghai timezone and publish it as the crawl_date output.
+        crawl_date="$(TZ=Asia/Shanghai date "+%Y-%m-%d")"
+        echo "crawl_date=${crawl_date}" >> "$GITHUB_OUTPUT"
+        # Only cron-triggered runs are skipped, and only once today's report file is present in the checkout.
+        if [[ "$GITHUB_EVENT_NAME" == "schedule" && -f "report/${crawl_date}.md" ]]; then
+          echo "今日 report 已存在，跳过重复定时运行 / Today's report exists, skip duplicate scheduled run"
+          echo "should_run=false" >> "$GITHUB_OUTPUT"
+        else
+          echo "should_run=true" >> "$GITHUB_OUTPUT"
+        fi
 
     - name: Crawl arXiv papers
       id: crawl_step
+      if: steps.schedule_guard.outputs.should_run == 'true'
       run: |
         source .venv/bin/activate
-        today=$(date -u "+%Y-%m-%d")
+        today=${{ steps.schedule_guard.outputs.crawl_date }}
         echo "开始爬取 $today 的arXiv论文... / Starting to crawl $today arXiv papers..."
 
         # 检查今日文件是否已存在，如存在则删除 / Check if today's file exists, delete if found
@@ -60,11 +75,13 @@ jobs:
 
     - name: Check for duplicates
       id: dedup_check
+      if: steps.schedule_guard.outputs.should_run == 'true'
       run: |
         source .venv/bin/activate
         echo "执行去重检查... / Performing intelligent deduplication check..."
 
         cd daily_arxiv
+        export CRAWL_DATE="${{ steps.crawl_step.outputs.crawl_date }}"
         # 执行去重检查脚本 / Execute intelligent deduplication check script
         set +e  # 暂时允许命令失败 / Temporarily allow command failure
         python daily_arxiv/check_stats.py
@@ -98,7 +115,7 @@ jobs:
         esac
 
     - name: AI Enhancement Processing
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
       run: |
         source .venv/bin/activate
         today=${{ steps.crawl_step.outputs.crawl_date }}
@@ -120,7 +137,7 @@ jobs:
         echo "AI增强处理完成 / AI enhancement processing completed"
 
     - name: Convert to Markdown
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
       run: |
         source .venv/bin/activate
         today=${{ steps.crawl_step.outputs.crawl_date }}
@@ -149,16 +166,65 @@ jobs:
             exit 1
         fi
         echo "Markdown转换完成 / Markdown conversion completed"
+
+    - name: Generate keyword daily report
+      if: steps.schedule_guard.outputs.should_run == 'true'
+      env:
+        LANGUAGE: ${{ vars.LANGUAGE }}
+        GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+        NOTEBOOKLM_MODEL: ${{ vars.NOTEBOOKLM_MODEL }}
+        NOTEBOOKLM_FALLBACK_MODELS: ${{ vars.NOTEBOOKLM_FALLBACK_MODELS }}
+      run: |
+        # Settings arrive via env: so secret/variable values are never spliced into the script text.
+        source .venv/bin/activate
+        today="${{ steps.crawl_step.outputs.crawl_date }}"
+        input_args=()  # stays empty when no data file exists, which yields the placeholder report
+        echo "生成关键词日报... / Generating keyword daily report..."
+        if [ -f "data/${today}_AI_enhanced_${LANGUAGE}.jsonl" ]; then
+          input_args=(--input "data/${today}_AI_enhanced_${LANGUAGE}.jsonl")
+        elif [ -f "data/${today}.jsonl" ]; then
+          input_args=(--input "data/${today}.jsonl")
+        else
+          echo "未找到数据文件，生成占位日报... / No data file found, generating placeholder report..."
+        fi
+        python report/generate_keyword_report.py "${input_args[@]}" --date "$today" --output-dir report --generate-source-guide --source-guide-dir report/author_source_guides --max-source-papers 30
+        echo "关键词日报生成完成 / Keyword daily report generated"
+
+    - name: Send daily report email
+      if: steps.schedule_guard.outputs.should_run == 'true'
+      env:
+        SMTP_HOST: ${{ secrets.SMTP_HOST }}
+        SMTP_PORT: ${{ vars.SMTP_PORT }}
+        SMTP_USER: ${{ secrets.SMTP_USER }}
+        SMTP_PASSWORD: ${{ secrets.SMTP_PASSWORD }}
+        SMTP_FROM: ${{ vars.SMTP_FROM }}
+        REPORT_EMAIL_TO: ${{ vars.REPORT_EMAIL_TO }}
+        SMTP_USE_SSL: ${{ vars.SMTP_USE_SSL }}
+      run: |
+        source .venv/bin/activate
+        report_date="${{ steps.crawl_step.outputs.crawl_date }}"
+        # Mailing is optional: when any of the three SMTP secrets is empty the step ends successfully without sending.
+        if [[ -z "$SMTP_HOST" || -z "$SMTP_USER" || -z "$SMTP_PASSWORD" ]]; then
+          echo "跳过邮件发送：SMTP_HOST/SMTP_USER/SMTP_PASSWORD 未完整配置 / Skip email: SMTP secrets missing"
+          exit 0
+        fi
+        # NOTE(review): --default-to is a hard-coded address; per the README it is used when REPORT_EMAIL_TO is unset.
+        python report/send_daily_email.py \
+          --date "$report_date" \
+          --report-file "report/${report_date}.md" \
+          --source-file "report/author_source_guides/${report_date}.md" \
+          --default-to "wangzhuoxuan@seu.edu.cn"
+        echo "邮件发送完成 / Daily email sent"
 
     - name: Update file list
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
       run: |
         echo "更新文件列表... / Updating file list..."
         ls data/*.jsonl | sed 's|data/||' > assets/file-list.txt
         echo "文件列表更新完成 / File list updated"
 
     - name: Generate password hash and inject into config
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
       run: |
         echo "🔐 Generating password hash for authentication..."
 
@@ -192,6 +258,7 @@ jobs:
         echo "🔐 Authentication setup complete"
 
     - name: Summary
+      if: steps.schedule_guard.outputs.should_run == 'true'
       run: |
         if [ "${{ steps.dedup_check.outputs.has_new_content }}" = "true" ]; then
           echo "✅ 工作流完成：去重发现新内容并成功处理 / Workflow completed: Smart deduplication found new content and processed successfully"
@@ -213,7 +280,7 @@ jobs:
         fi
 
     - name: Inject repository info into data-config.js
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
       run: |
         echo "注入仓库信息到 data-config.js... / Injecting repository info into data-config.js..."
 
@@ -235,7 +302,7 @@ jobs:
         fi
 
     - name: Commit code changes to main branch
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true'
       run: |
         git config --global user.email "${{ vars.EMAIL }}"
         git config --global user.name "${{ vars.NAME }}"
@@ -252,12 +319,12 @@ jobs:
         if git diff --staged --quiet; then
           echo "没有代码变更需要提交 / No code changes to commit"
         else
-          git commit -m "chore: update data-config.js with repository info"
+          git commit -m "chore: update workflow config and daily report"
           echo "代码变更已提交到 main 分支 / Code changes committed to main branch"
         fi
 
     - name: Push code changes to main branch
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true'
       run: |
         # 设置Git配置以处理自动合并 / Set Git config for automatic merging
         git config pull.rebase true
@@ -280,7 +347,7 @@ jobs:
         done
 
     - name: Prepare data files for data branch
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
       run: |
         echo "准备数据文件... / Preparing data files..."
         today=${{ steps.crawl_step.outputs.crawl_date }}
@@ -296,7 +363,7 @@ jobs:
         echo "数据文件已保存到临时目录 / Data files saved to temporary directory"
 
     - name: Setup and commit to data branch
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
       run: |
         git config --global user.email "${{ vars.EMAIL }}"
         git config --global user.name "${{ vars.NAME }}"
@@ -350,7 +417,7 @@ jobs:
         fi
 
     - name: Push data changes to data branch
-      if: steps.dedup_check.outputs.has_new_content == 'true'
+      if: steps.schedule_guard.outputs.should_run == 'true' && steps.dedup_check.outputs.has_new_content == 'true'
       run: |
         # 设置Git配置以处理自动合并 / Set Git config for automatic merging
         git config pull.rebase true

diff --git a/README.md b/README.md
@@ -44,18 +44,29 @@ Otherwise, you can directly use this repo in https://dw-dengwei.github.io/daily-
 1. Fork this repo to your own account and delete my own information in [by-me-a-coffee](./buy-me-a-coffee/README.md).
 2. Go to: your-own-repo -> Settings -> Secrets and variables -> Actions
 3. Go to Secrets. Secrets are encrypted and used for sensitive data
-4. Create two repository secrets named `OPENAI_API_KEY` and `OPENAI_BASE_URL`, and input corresponding values.
-5. [Optional] Set a password in `secrets.ACCESS_PASSWORD` if you do not wish others to access your page. (see https://github.com/dw-dengwei/daily-arXiv-ai-enhanced/pull/64)
-6. Go to Variables. Variables are shown as plain text and are used for non-sensitive data
-7. Create the following repository variables:
+4. Create repository secrets named `OPENAI_API_KEY` and `OPENAI_BASE_URL`, and input corresponding values.
+5. [Optional] Create a `GOOGLE_API_KEY` secret to generate Gemini-based source summaries for author-matched papers.
+6. [Optional] Configure SMTP secrets to send daily reports by email:
+   - `SMTP_HOST`
+   - `SMTP_USER`
+   - `SMTP_PASSWORD`
+7. [Optional] Set a password in `secrets.ACCESS_PASSWORD` if you do not wish others to access your page. (see https://github.com/dw-dengwei/daily-arXiv-ai-enhanced/pull/64)
+8. Go to Variables. Variables are shown as plain text and are used for non-sensitive data
+9. Create the following repository variables:
    1. `CATEGORIES`: separate the categories with ",", such as "cs.CL, cs.CV"
    2. `LANGUAGE`: such as "Chinese" or "English"
    3. `MODEL_NAME`: such as "deepseek-chat"
-   4. `EMAIL`: your email for push to GitHub
-   5. `NAME`: your name for push to GitHub
-8. Go to your-own-repo -> Actions -> arXiv-daily-ai-enhanced
-9. You can manually click **Run workflow** to test if it works well (it may take about one hour). By default, this action will automatically run every day. You can modify it in `.github/workflows/run.yml`
-10. Set up GitHub pages: Go to your own repo -> Settings -> Pages. In `Build and deployment`, set `Source="Deploy from a branch"`, `Branch="main", "/(root)"`. Wait for a few minutes, go to https://\<username\>.github.io/daily-arXiv-ai-enhanced/. Please see this [issue](https://github.com/dw-dengwei/daily-arXiv-ai-enhanced/issues/14) for more precise instructions.
+   4. `NOTEBOOKLM_MODEL` (optional): preferred Gemini model, such as "gemini-2.0-flash"
+   5. `NOTEBOOKLM_FALLBACK_MODELS` (optional): comma-separated fallback models, such as "gemini-2.0-flash,gemini-1.5-flash"
+   6. `SMTP_PORT` (optional): default `465`
+   7. `SMTP_FROM` (optional): sender email shown in mailbox, default `SMTP_USER`
+   8. `SMTP_USE_SSL` (optional): `true` or `false`, default `true`
+   9. `REPORT_EMAIL_TO` (optional): comma-separated recipient list. If unset, reports are sent to the workflow's built-in default address, `wangzhuoxuan@seu.edu.cn`, so set this to your own address when you fork.
+   10. `EMAIL`: your email for push to GitHub
+   11. `NAME`: your name for push to GitHub
+10. Go to your-own-repo -> Actions -> arXiv-daily-ai-enhanced
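+11. You can manually click **Run workflow** to test if it works well (it may take about one hour). By default, this action is triggered every 10 minutes from 09:00 to 09:50 Beijing time (01:00–01:50 UTC) every day; a scheduled run is skipped once that day's report already exists. You can modify it in `.github/workflows/run.yml`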
+12. Set up GitHub pages: Go to your own repo -> Settings -> Pages. In `Build and deployment`, set `Source="Deploy from a branch"`, `Branch="main", "/(root)"`. Wait for a few minutes, go to https://\<username\>.github.io/daily-arXiv-ai-enhanced/. Please see this [issue](https://github.com/dw-dengwei/daily-arXiv-ai-enhanced/issues/14) for more precise instructions.
 
 # Plans
 See https://github.com/users/dw-dengwei/projects/3