[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-tatsu-lab--gpt_paper_assistant":3,"tool-tatsu-lab--gpt_paper_assistant":61},[4,18,26,36,44,52],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":17},4358,"openclaw","openclaw\u002Fopenclaw","OpenClaw 是一款专为个人打造的本地化 AI 助手，旨在让你在自己的设备上拥有完全可控的智能伙伴。它打破了传统 AI 助手局限于特定网页或应用的束缚，能够直接接入你日常使用的各类通讯渠道，包括微信、WhatsApp、Telegram、Discord、iMessage 等数十种平台。无论你在哪个聊天软件中发送消息，OpenClaw 都能即时响应，甚至支持在 macOS、iOS 和 Android 设备上进行语音交互，并提供实时的画布渲染功能供你操控。\n\n这款工具主要解决了用户对数据隐私、响应速度以及“始终在线”体验的需求。通过将 AI 部署在本地，用户无需依赖云端服务即可享受快速、私密的智能辅助，真正实现了“你的数据，你做主”。其独特的技术亮点在于强大的网关架构，将控制平面与核心助手分离，确保跨平台通信的流畅性与扩展性。\n\nOpenClaw 非常适合希望构建个性化工作流的技术爱好者、开发者，以及注重隐私保护且不愿被单一生态绑定的普通用户。只要具备基础的终端操作能力（支持 macOS、Linux 及 Windows WSL2），即可通过简单的命令行引导完成部署。如果你渴望拥有一个懂你",349277,3,"2026-04-06T06:32:30",[13,14,15,16],"Agent","开发框架","图像","数据工具","ready",{"id":19,"name":20,"github_repo":21,"description_zh":22,"stars":23,"difficulty_score":10,"last_commit_at":24,"category_tags":25,"status":17},3808,"stable-diffusion-webui","AUTOMATIC1111\u002Fstable-diffusion-webui","stable-diffusion-webui 是一个基于 Gradio 构建的网页版操作界面，旨在让用户能够轻松地在本地运行和使用强大的 Stable Diffusion 图像生成模型。它解决了原始模型依赖命令行、操作门槛高且功能分散的痛点，将复杂的 AI 绘图流程整合进一个直观易用的图形化平台。\n\n无论是希望快速上手的普通创作者、需要精细控制画面细节的设计师，还是想要深入探索模型潜力的开发者与研究人员，都能从中获益。其核心亮点在于极高的功能丰富度：不仅支持文生图、图生图、局部重绘（Inpainting）和外绘（Outpainting）等基础模式，还独创了注意力机制调整、提示词矩阵、负向提示词以及“高清修复”等高级功能。此外，它内置了 GFPGAN 和 CodeFormer 等人脸修复工具，支持多种神经网络放大算法，并允许用户通过插件系统无限扩展能力。即使是显存有限的设备，stable-diffusion-webui 也提供了相应的优化选项，让高质量的 AI 艺术创作变得触手可及。",162132,"2026-04-05T11:01:52",[14,15,13],{"id":27,"name":28,"github_repo":29,"description_zh":30,"stars":31,"difficulty_score":32,"last_commit_at":33,"category_tags":34,"status":17},1381,"everything-claude-code","affaan-m\u002Feverything-claude-code","everything-claude-code 是一套专为 AI 编程助手（如 Claude Code、Codex、Cursor 等）打造的高性能优化系统。它不仅仅是一组配置文件，而是一个经过长期实战打磨的完整框架，旨在解决 AI 代理在实际开发中面临的效率低下、记忆丢失、安全隐患及缺乏持续学习能力等核心痛点。\n\n通过引入技能模块化、直觉增强、记忆持久化机制以及内置的安全扫描功能，everything-claude-code 能显著提升 AI 在复杂任务中的表现，帮助开发者构建更稳定、更智能的生产级 AI 代理。其独特的“研究优先”开发理念和针对 Token 消耗的优化策略，使得模型响应更快、成本更低，同时有效防御潜在的攻击向量。\n\n这套工具特别适合软件开发者、AI 研究人员以及希望深度定制 AI 工作流的技术团队使用。无论您是在构建大型代码库，还是需要 AI 协助进行安全审计与自动化测试，everything-claude-code 都能提供强大的底层支持。作为一个曾荣获 Anthropic 黑客大奖的开源项目，它融合了多语言支持与丰富的实战钩子（hooks），让 AI 真正成长为懂上",141543,2,"2026-04-06T11:32:54",[14,13,35],"语言模型",{"id":37,"name":38,"github_repo":39,"description_zh":40,"stars":41,"difficulty_score":32,"last_commit_at":42,"category_tags":43,"status":17},2271,"ComfyUI","Comfy-Org\u002FComfyUI","ComfyUI 是一款功能强大且高度模块化的视觉 AI 引擎，专为设计和执行复杂的 Stable Diffusion 图像生成流程而打造。它摒弃了传统的代码编写模式，采用直观的节点式流程图界面，让用户通过连接不同的功能模块即可构建个性化的生成管线。\n\n这一设计巧妙解决了高级 AI 绘图工作流配置复杂、灵活性不足的痛点。用户无需具备编程背景，也能自由组合模型、调整参数并实时预览效果，轻松实现从基础文生图到多步骤高清修复等各类复杂任务。ComfyUI 拥有极佳的兼容性，不仅支持 Windows、macOS 和 Linux 全平台，还广泛适配 NVIDIA、AMD、Intel 及苹果 Silicon 等多种硬件架构，并率先支持 SDXL、Flux、SD3 等前沿模型。\n\n无论是希望深入探索算法潜力的研究人员和开发者，还是追求极致创作自由度的设计师与资深 AI 绘画爱好者，ComfyUI 都能提供强大的支持。其独特的模块化架构允许社区不断扩展新功能，使其成为当前最灵活、生态最丰富的开源扩散模型工具之一，帮助用户将创意高效转化为现实。",107888,"2026-04-06T11:32:50",[14,15,13],{"id":45,"name":46,"github_repo":47,"description_zh":48,"stars":49,"difficulty_score":10,"last_commit_at":50,"category_tags":51,"status":17},4487,"LLMs-from-scratch","rasbt\u002FLLMs-from-scratch","LLMs-from-scratch 是一个基于 PyTorch 的开源教育项目，旨在引导用户从零开始一步步构建一个类似 ChatGPT 的大型语言模型（LLM）。它不仅是同名技术著作的官方代码库，更提供了一套完整的实践方案，涵盖模型开发、预训练及微调的全过程。\n\n该项目主要解决了大模型领域“黑盒化”的学习痛点。许多开发者虽能调用现成模型，却难以深入理解其内部架构与训练机制。通过亲手编写每一行核心代码，用户能够透彻掌握 Transformer 架构、注意力机制等关键原理，从而真正理解大模型是如何“思考”的。此外，项目还包含了加载大型预训练权重进行微调的代码，帮助用户将理论知识延伸至实际应用。\n\nLLMs-from-scratch 特别适合希望深入底层原理的 AI 开发者、研究人员以及计算机专业的学生。对于不满足于仅使用 API，而是渴望探究模型构建细节的技术人员而言，这是极佳的学习资源。其独特的技术亮点在于“循序渐进”的教学设计：将复杂的系统工程拆解为清晰的步骤，配合详细的图表与示例，让构建一个虽小但功能完备的大模型变得触手可及。无论你是想夯实理论基础，还是为未来研发更大规模的模型做准备",90106,"2026-04-06T11:19:32",[35,15,13,14],{"id":53,"name":54,"github_repo":55,"description_zh":56,"stars":57,"difficulty_score":10,"last_commit_at":58,"category_tags":59,"status":17},4292,"Deep-Live-Cam","hacksider\u002FDeep-Live-Cam","Deep-Live-Cam 是一款专注于实时换脸与视频生成的开源工具，用户仅需一张静态照片，即可通过“一键操作”实现摄像头画面的即时变脸或制作深度伪造视频。它有效解决了传统换脸技术流程繁琐、对硬件配置要求极高以及难以实时预览的痛点，让高质量的数字内容创作变得触手可及。\n\n这款工具不仅适合开发者和技术研究人员探索算法边界，更因其极简的操作逻辑（仅需三步：选脸、选摄像头、启动），广泛适用于普通用户、内容创作者、设计师及直播主播。无论是为了动画角色定制、服装展示模特替换，还是制作趣味短视频和直播互动，Deep-Live-Cam 都能提供流畅的支持。\n\n其核心技术亮点在于强大的实时处理能力，支持口型遮罩（Mouth Mask）以保留使用者原始的嘴部动作，确保表情自然精准；同时具备“人脸映射”功能，可同时对画面中的多个主体应用不同面孔。此外，项目内置了严格的内容安全过滤机制，自动拦截涉及裸露、暴力等不当素材，并倡导用户在获得授权及明确标注的前提下合规使用，体现了技术发展与伦理责任的平衡。",88924,"2026-04-06T03:28:53",[14,15,13,60],"视频",{"id":62,"github_repo":63,"name":64,"description_en":65,"description_zh":66,"ai_summary_zh":67,"readme_en":68,"readme_zh":69,"quickstart_zh":70,"use_case_zh":71,"hero_image_url":72,"owner_login":73,"owner_name":74,"owner_avatar_url":75,"owner_bio":76,"owner_company":77,"owner_location":77,"owner_email":77,"owner_twitter":77,"owner_website":77,"owner_url":78,"languages":79,"stars":84,"forks":85,"last_commit_at":86,"license":87,"difficulty_score":32,"env_os":88,"env_gpu":89,"env_ram":90,"env_deps":91,"category_tags":97,"github_topics":99,"view_count":32,"oss_zip_url":77,"oss_zip_packed_at":77,"status":17,"created_at":103,"updated_at":104,"faqs":105,"releases":136},4555,"tatsu-lab\u002Fgpt_paper_assistant","gpt_paper_assistant","GPT4 based personalized ArXiv paper assistant bot","gpt_paper_assistant 是一款基于 GPT-4 打造的个性化 ArXiv 论文助手，旨在帮助科研人员从海量学术文献中高效筛选出真正感兴趣的内容。面对每日激增的预印本论文，研究者往往难以及时捕捉与自己研究方向或关注学者相关的最新成果。这款工具通过每日自动扫描 ArXiv，结合用户自定义的研究主题和特定作者列表，利用大模型智能过滤并生成精选论文日报。\n\n它特别适合人工智能领域的研究人员、博士生以及需要紧跟前沿技术的开发者使用。用户只需简单配置关注的领域（如 cs.CL）和目标作者，gpt_paper_assistant 即可通过 GitHub Actions 定时运行，将结果以静态网页形式发布，或直接推送到 Slack 频道，实现信息的无缝流转。\n\n其技术亮点在于巧妙融合了语义学者（Semantic Scholar）的作者 ID 匹配机制与 GPT-4 的语义理解能力，不仅能精准追踪特定大牛的最新动态，还能根据标题和摘要的相关性进行深度筛选。此外，项目在设计上充分考虑了成本效益，通过标题预过滤等策略，使得全量扫描特定领域的日均 API 成本极低（例如扫描 cs.CL 类别仅","gpt_paper_assistant 是一款基于 GPT-4 打造的个性化 ArXiv 论文助手，旨在帮助科研人员从海量学术文献中高效筛选出真正感兴趣的内容。面对每日激增的预印本论文，研究者往往难以及时捕捉与自己研究方向或关注学者相关的最新成果。这款工具通过每日自动扫描 ArXiv，结合用户自定义的研究主题和特定作者列表，利用大模型智能过滤并生成精选论文日报。\n\n它特别适合人工智能领域的研究人员、博士生以及需要紧跟前沿技术的开发者使用。用户只需简单配置关注的领域（如 cs.CL）和目标作者，gpt_paper_assistant 即可通过 GitHub Actions 定时运行，将结果以静态网页形式发布，或直接推送到 Slack 频道，实现信息的无缝流转。\n\n其技术亮点在于巧妙融合了语义学者（Semantic Scholar）的作者 ID 匹配机制与 GPT-4 的语义理解能力，不仅能精准追踪特定大牛的最新动态，还能根据标题和摘要的相关性进行深度筛选。此外，项目在设计上充分考虑了成本效益，通过标题预过滤等策略，使得全量扫描特定领域的日均 API 成本极低（例如扫描 cs.CL 类别仅需约 0.07 美元），让个性化的学术资讯订阅变得经济且易于部署。","# GPT4 paper assistant: A daily ArXiv scanner\n\nThis repo implements a very simple daily scanner for Arxiv that uses GPT4 and author matches to find papers you might find interesting. \nIt will run daily via github actions and can post this information to slack via a bot or just render it in a static github-pages website.\n\nA simple demo of the daily papers can be seen [here](https:\u002F\u002Ftatsu-lab.github.io\u002Fgpt_paper_assistant\u002F) running on `cs.CL`\n\nAs a cost estimate, running this on all of `cs.CL` cost $0.07 on 2\u002F7\u002F2024\n\n## Changelog\n- **2\u002F15\u002F2024**: fixed a bug with author parsing in the RSS format + cost estimates for title filtering being off + crash when 0 papers are on the feed. \n- **2\u002F7\u002F2024**: fixed a critical issue from ArXiv changing their RSS format. Added and enabled a title filtering to reduce costs.\n\n\n## Quickstart\nThis is the minimal necessary steps to get the scanner to run. It is highly recommended to read the whole thing to decide what you want to run.\n\n### Running on github actions\n\n1. Copy\u002Ffork this repo to a new github repo and [enable scheduled workflows](https:\u002F\u002Fdocs.github.com\u002Fen\u002Factions\u002Fusing-workflows\u002Fdisabling-and-enabling-a-workflow) if you fork it.\n2. Copy `config\u002Fpaper_topics.template.txt` to `config\u002Fpaper_topics.txt` and fill it out with the types of papers you want to follow\n3. Copy `config\u002Fauthors.template.txt` to `config\u002Fauthors.txt` and list the authors you actually want to follow. The numbers behind the author are important. They are semantic scholar author IDs which you can find by looking up the authors on semantic scholar and taking the numbers at the end of the URL.\n4. Set your desired ArXiv categories in `config\u002Fconfig.ini`.\n5. Set your openai key (`OAI_KEY`) as ``a [github secret](https:\u002F\u002Fdocs.github.com\u002Fen\u002Factions\u002Fsecurity-guides\u002Fusing-secrets-in-github-actions#creating-secrets-for-a-repository)\n6. In your repo settings, set github page build sources to be [github actions](https:\u002F\u002Fdocs.github.com\u002Fen\u002Fpages\u002Fgetting-started-with-github-pages\u002Fconfiguring-a-publishing-source-for-your-github-pages-site#publishing-with-a-custom-github-actions-workflow)\n\nAt this point your bot should run daily and publish a static website. You can test this by running the github action workflow manually.\n\n**Optional but highly recommended**: \n\n7. Get and set up a semantic scholar API key (`S2_KEY`) as a github secret. Otherwise the author search step will be very slow\n8. [Set up a slack bot](https:\u002F\u002Fapi.slack.com\u002Fstart\u002Fquickstart), get the OAuth key, set it to `SLACK_KEY` as a github secret\n9. Make a channel for the bot (and invite it to the channel), get its [Slack channel id](https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F40940327\u002Fwhat-is-the-simplest-way-to-find-a-slack-team-id-and-a-channel-id), set it as `SLACK_CHANNEL_ID` in a github secret.\n10. Take a look at `configs\u002Fconfig.ini` to tweak how things are filtered.\n11. Set the github repo private to avoid github actions being [set to inactive after 60 days](https:\u002F\u002Fdocs.github.com\u002Fen\u002Factions\u002Fusing-workflows\u002Fdisabling-and-enabling-a-workflow)\n\nEach day at 1pm UTC, the bot will run and post to slack and publish a github pages website (see the publish_md and cron_runs actions for details).\n\n### Running locally\n\nThe steps are generally the same as above, but you have to set up the environment via `requirements.txt`\n\nInstead of passing credentials via github secrets, you have to set environment variables `OAI_KEY`, `SLACK_KEY`, `SLACK_CHANNEL_ID`.\n\nTo run everything, just call `main.py`\n\n**Other notes:**\nYou may also want to not push to slack, in which case set your desired output endpoint (json, markdown, slack) in the `dump_json`, `dump_md`, and `push_to_slack` fields of `config\u002Fconfig.ini`.\n\nIf the semantic scholar API times out or is slow, you should get a [S2 api key](https:\u002F\u002Fwww.semanticscholar.org\u002Fproduct\u002Fapi#api-key-form) and set it as `S2_KEY` in your environment variables.\n(due to the limitations of github actions, this will only help if the code is run locally)\n\n**Making it run on its own:**\nThis whole thing takes almost no compute, so you can rent the cheapest VM from AWS, put this repo in it, install the `requirements.txt`\nappropriately set up the environment variables and add the following crontab\n```\n0 13 * * * python ~\u002Farxiv_scanner\u002Fmain.py\n```\nThis crontab will run the script every 1pm UTC, 6pm pacific. \n\n## Making the `paper_topics.txt` prompt\nThe `paper_topics.txt` file is used to generate the prompt for GPT. It is a list of topics that you want to follow.\nOne set of examples might be something like \n```text\n 1. New methodological improvements to RLHF or instruction-following which are specific fine-tuning steps that are taken to make language models better at following user instructions across a range of tasks.\n    - Relevant: papers that discuss specific methods like RLHF, or instruction-tuning datasets, improving these methods, or analyzing them.\n    - Not relevant: papers about adaptation to some task. Simply following instructions or inputs are not sufficient.\n 2. Shows new powerful test set contamination or membership inference methods for language models. Test set contamination is the phenomenon where a language model observes a benchmark dataset during pretraining.\n    - Relevant: test statistics that can detect contamination of benchmarks in language models. statistics that can provide guarantees are more interesting. membership inference methods that are general enough to apply to language models are also relevant.\n    - Not relevant: any papers that do not consider language models, or that do not consider test set contamination.\n 3. Shows a significant advance in the performance of diffusion language models.\n    - Relevant: papers that study language models that are also diffusion models. Continuous diffusions are even more relevant, while discrete diffusions are less so.\n    - Not relevant: papers about image diffusions like DALL-E or Stable Diffusion, or papers that do not explicitly mention language models or applications to text.\n```\nThis is just a standard prompt, but being very specific can help, especially for things like 'diffusion language models' or 'instruction-following', where the LM can get confused about whether image diffusions are relevant, or if doing some task better is sufficient to improve instruction following.\n\nYou may also want to follow this with some general interest areas like\n```text\nIn suggesting papers to your friend, remember that he enjoys papers on statistical machine learning, and generative modeling in natural language processing.\n Your friend also likes learning about surprising empirical results in language models, as well as clever statistical tricks.\n He does not want to read papers that are about primarily applications of methods to specific domains.\n```\n\n## Details of how it works\n\nThe script grabs a candidate set of ArXiv papers for a specific day, via the RSS feeds. To avoid double-announcing papers, it will only grab an RSS feed within the last day. To avoid missing papers, you'd want to run this every day. \nIt filters out any `UPDATED` papers and announces only new ones.\n\nThe filtering logic is pretty simple. We first check for author match.\n1. Do a lookup of the authors on semantic scholar, getting a list of candidate matches.\n2. Check the authors of the paper. If the author semantic scholar id matches someone in `authors.txt` it goes in the candidate set with a default score of `author_match_score`.\n\nWe then check for GPT-evaluated relevance. We do this in two steps.\n1. Filter out any papers that have no authors with h-index above `hcutoff` in `config.ini`. This is to reduce costs.\n2. All remaining examples get batched, and are evaluated by a GPT model specified by `model` in `config.ini`. **You should only use GPT3.5 for debugging. It does not work well for this purpose!**\nThis step uses the following prompt setup defined in `configs\u002F`\n\n>You are a helpful paper reading assistant whose job is to read daily posts from ArXiv and identify a few papers that might be relevant for your friend. There will be up to 5 papers below. Your job is to find papers that:\n> 1. Criterion 1\n> 2. Criterion 2\n> \n> [PAPERS]\n> \n> Write the response in JSONL format with {ARXIVID, COMMENT, RELEVANCE, NOVELTY} on each line, one for each paper.\nThe ARXIVID should be the ArXiv ID.\nThe COMMENT should identify whether there is a criteria that match the paper very closely. If so, it should mention it by number (no need to mention the non-matching criteria).\nThese matches should not be based on general terms like \"language modeling\" or \"advancements\" and should specifically refer to a criterion.\nThe RELEVANCE should be a relevance score from 1-10 where 10 must be directly related to the exact, specific criterion with near-synonym keyword matches and authors who are known for working on the topic, 1 is irrelevant to any criterion, and unrelated to your friend's general interest area, 2-3 is papers that are relevant to the general interest area, but not specific criteria, and 5 is a direct match to a specific criterion.\nThe NOVELTY should be a score from 1 to 10, where 10 is a groundbreaking, general-purpose discovery that would transform the entire field and 1 is work that improves one aspect of a problem or is an application to a very specific field. Read the abstract carefully to determine this and assume that authors cannot be trusted in their claims of novelty.\n\n3. GPT scores the papers for relevance (to the topics in `config\u002Fpapers_topics.txt`) and novelty (scale 1-10)\n4. Papers are filtered if they have scores below either the relevance and novelty cutoffs in `config.ini`\n5. Papers are given an overall score based on equal weight to relevance and novelty\n\nFinally, all papers are sorted by the max of their `author_match_score` and the sum of the GPT-rated relevance and novelty scores (the relevance and novelty scores will only show up in the final output if they are above the cutoff thresholds you set in the config file). Then the papers are rendered and pushed into their endpoints (text files or Slack).\n\n## Contributing \nThis repo uses ruff - `ruff check .` and `ruff format .` \nPlease install the pre-commit hook by running `pre-commit install`\n\n### Testing and improving the GPT filter\nThe `filter_papers.py` code can also be run as a standalone script.\nThis will take a batch of papers in `in\u002Fdebug_papers.json`, run whatever config and prompts you have\nand return an output to `out\u002Ffilter_paper_test.debug.json`. If you find the bot makes mistakes, you can find the associated batch in `out\u002Fgpt_paper_batches.debug.json` and copy that into the relevant `debug_papers` file.\n\nThis lets you build a benchmark for the filter and to see what comes out on the other side.\n\n## Other stuff\nThis repo and code was originally built by Tatsunori Hashimoto is licensed under the Apache 2.0 license.\nThanks to Chenglei Si for testing and benchmarking the GPT filter.\n","# GPT4论文助手：每日ArXiv扫描器\n\n这个仓库实现了一个非常简单的ArXiv每日扫描工具，它利用GPT4和作者匹配功能来寻找你可能感兴趣的论文。该工具会通过GitHub Actions每天自动运行，并可以通过机器人将信息发布到Slack，或者直接在静态的GitHub Pages网站上展示。\n\n一个简单的每日论文演示可以在[这里](https:\u002F\u002Ftatsu-lab.github.io\u002Fgpt_paper_assistant\u002F)查看，当前运行在`cs.CL`类别下。\n\n作为成本估算，在2024年2月7日，对整个`cs.CL`类别运行此工具的成本为0.07美元。\n\n## 更改记录\n- **2024年2月15日**：修复了RSS格式中作者解析的错误，以及标题过滤导致成本估算不准确的问题；同时修复了当源中没有论文时程序崩溃的故障。\n- **2024年2月7日**：修复了由于ArXiv更改其RSS格式而引发的关键问题，并添加并启用了标题过滤以降低成本。\n\n\n## 快速入门\n以下是让扫描器运行所需的最少步骤。强烈建议阅读完整文档，以便决定如何配置和运行。\n\n### 在GitHub Actions上运行\n\n1. 复制或分叉本仓库到一个新的GitHub仓库，如果你是分叉的话，请[启用计划工作流](https:\u002F\u002Fdocs.github.com\u002Fen\u002Factions\u002Fusing-workflows\u002Fdisabling-and-enabling-a-workflow)。\n2. 将`config\u002Fpaper_topics.template.txt`复制到`config\u002Fpaper_topics.txt`，并填写你希望关注的论文类型。\n3. 将`config\u002Fauthors.template.txt`复制到`config\u002Fauthors.txt`，列出你真正想关注的作者。作者后面的数字非常重要，它们是Semantic Scholar的作者ID，你可以在Semantic Scholar上搜索作者后，从URL末尾获取这些数字。\n4. 在`config\u002Fconfig.ini`中设置你期望的ArXiv类别。\n5. 将你的OpenAI密钥（`OAI_KEY`）作为[GitHub秘密变量](https:\u002F\u002Fdocs.github.com\u002Fen\u002Factions\u002Fsecurity-guides\u002Fusing-secrets-in-github-actions#creating-secrets-for-a-repository)进行设置。\n6. 在你的仓库设置中，将GitHub Pages的构建源设置为[GitHub Actions](https:\u002F\u002Fdocs.github.com\u002Fen\u002Fpages\u002Fgetting-started-with-github-pages\u002Fconfiguring-a-publishing-source-for-your-github-pages-site#publishing-with-a-custom-github-actions-workflow)。\n\n至此，你的机器人应该会每天运行，并发布一个静态网站。你可以手动运行GitHub Actions工作流来测试这一点。\n\n**可选但强烈推荐**：\n\n7. 获取并设置Semantic Scholar API密钥（`S2_KEY`）作为GitHub秘密变量。否则，作者搜索步骤将会非常缓慢。\n8. [设置一个Slack机器人](https:\u002F\u002Fapi.slack.com\u002Fstart\u002Fquickstart)，获取OAuth密钥，并将其设置为GitHub秘密变量`SLACK_KEY`。\n9. 为机器人创建一个频道（并邀请它加入该频道），获取其[Slack频道ID](https:\u002F\u002Fstackoverflow.com\u002Fquestions\u002F40940327\u002Fwhat-is-the-simplest-way-to-find-a-slack-team-id-and-a-channel-id)，并将其设置为GitHub秘密变量`SLACK_CHANNEL_ID`。\n10. 查看`configs\u002Fconfig.ini`文件，调整过滤规则。\n11. 将GitHub仓库设为私有，以避免GitHub Actions在60天后被[自动禁用](https:\u002F\u002Fdocs.github.com\u002Fen\u002Factions\u002Fusing-workflows\u002Fdisabling-and-enabling-a-workflow)。\n\n每天UTC时间下午1点，机器人将会运行，向Slack发送消息，并发布GitHub Pages网站（详情请参阅`publish_md`和`cron_runs`工作流）。\n\n### 本地运行\n\n步骤与上述大致相同，但你需要通过`requirements.txt`来设置环境。\n\n不同于通过GitHub秘密变量传递凭证，你需要设置环境变量`OAI_KEY`、`SLACK_KEY`和`SLACK_CHANNEL_ID`。\n\n要运行所有内容，只需调用`main.py`即可。\n\n**其他注意事项：**\n你也可以选择不推送至Slack，此时可在`config\u002Fconfig.ini`中的`dump_json`、`dump_md`和`push_to_slack`字段中设置你期望的输出端点（JSON、Markdown或Slack）。\n\n如果Semantic Scholar API出现超时或响应缓慢的情况，你应该获取一个[S2 API密钥](https:\u002F\u002Fwww.semanticscholar.org\u002Fproduct\u002Fapi#api-key-form)，并将其设置为环境变量`S2_KEY`。（由于GitHub Actions的限制，只有在本地运行代码时才能起到效果）\n\n**使其独立运行：**\n整个系统几乎不需要计算资源，因此你可以租用AWS上最便宜的虚拟机，将此仓库部署到其中，并安装`requirements.txt`中的依赖项。然后正确设置环境变量，并添加以下crontab：\n```\n0 13 * * * python ~\u002Farxiv_scanner\u002Fmain.py\n```\n这条crontab将在每天UTC下午1点（太平洋时间下午6点）运行脚本。\n\n## 如何编写`paper_topics.txt`提示\n`paper_topics.txt`文件用于生成GPT的提示语，它是一个你希望关注的主题列表。\n一组示例可能如下所示：\n```text\n1. RLHF或指令跟随方面的新型方法论改进，即为了提升语言模型在各类任务中更好地遵循用户指令而采取的具体微调步骤。\n    - 相关：讨论RLHF等具体方法、指令微调数据集、改进这些方法或对其进行分析的论文。\n    - 不相关：仅涉及某种任务适应的论文。单纯地遵循指令或输入并不足以说明问题。\n2. 展示针对语言模型的新型强大测试集污染检测或成员推理方法。测试集污染是指语言模型在预训练过程中接触到基准数据集的现象。\n    - 相关：能够检测语言模型中基准数据集污染的统计指标，尤其是能提供可靠保证的统计方法；同样相关的还有适用于语言模型的通用成员推理方法。\n    - 不相关：任何不涉及语言模型，或未考虑测试集污染的论文。\n3. 显示扩散式语言模型性能的重大突破。\n    - 相关：研究既是扩散模型又是语言模型的论文。连续扩散模型更为重要，而离散扩散模型则相对次要。\n    - 不相关：关于DALL-E或Stable Diffusion等图像扩散模型的论文，以及未明确提及语言模型或文本应用的论文。\n```\n这只是一个标准的提示语，但越具体越好，尤其是在“扩散式语言模型”或“指令跟随”这类主题上，因为语言模型可能会混淆图像扩散是否相关，或者仅仅提高某项任务的表现是否足以改善指令跟随能力。\n\n你还可以在后面补充一些更广泛的兴趣领域，例如：\n```text\n在为你朋友推荐论文时，请记住他喜欢统计机器学习和自然语言处理中的生成建模方面的论文。此外，他也乐于了解语言模型中令人惊讶的实证结果，以及巧妙的统计技巧。但他并不希望阅读那些主要关注方法在特定领域应用的论文。\n```\n\n## 工作原理详解\n\n该脚本通过 RSS 订阅源抓取特定日期的 ArXiv 论文候选集。为避免重复推送论文，它只会获取过去一天内的 RSS 源。为了不遗漏任何论文，建议每天运行一次。\n\n脚本会过滤掉所有标记为“已更新”的论文，仅推送新的论文。\n\n过滤逻辑非常简单：首先检查作者是否匹配。\n1. 在 Semantic Scholar 上查询作者，获取可能的匹配列表。\n2. 检查论文中的作者。如果某位作者的 Semantic Scholar ID 与 `authors.txt` 文件中的某个人匹配，则该论文会被加入候选集，并赋予默认分数 `author_match_score`。\n\n接下来，脚本会根据 GPT 评估的相关性进行筛选。这一过程分为两步：\n1. 过滤掉所有没有 h-index 高于 `config.ini` 中 `hcutoff` 值的作者的论文，以降低成本。\n2. 将剩余的论文分批提交给 `config.ini` 中指定的 GPT 模型进行评估。**调试时应仅使用 GPT-3.5，因为它在此任务中效果不佳！**\n此步骤采用 `configs\u002F` 中定义的以下提示模板：\n\n> 你是一位贴心的论文阅读助手，负责每日浏览 ArXiv 的最新帖子，为你的朋友挑选几篇可能相关的论文。以下是最多五篇论文，请从中找出符合以下条件的论文：\n> 1. 条件一\n> 2. 条件二\n> \n> [论文列表]\n> \n> 请以 JSONL 格式在每行中输出 {ARXIVID, COMMENT, RELEVANCE, NOVELTY}，每篇论文对应一行。\n> ARXIVID 应为论文的 ArXiv ID。\n> COMMENT 应指明是否有某条条件与论文高度契合；若有，需明确指出具体是哪一条（无需提及未匹配的条件）。\n> 匹配依据不应是“语言建模”或“进展”等宽泛术语，而应具体指向某一明确条件。\n> RELEVANCE 是相关性评分，范围为 1 到 10：10 分表示论文与特定条件完全吻合，关键词近似且作者在该领域享有盛名；1 分表示与任何条件均无关，也与你朋友的兴趣领域毫不相干；2–3 分表示论文虽与兴趣领域相关，但未满足具体条件；5 分则表示与某一特定条件直接匹配。\n> NOVELTY 是创新性评分，范围为 1 到 10：10 分代表具有突破性的通用发现，将彻底改变整个领域；1 分则表示仅改进了问题的某个方面，或应用于非常特定的子领域。请仔细阅读摘要来判断，并假设作者关于创新性的声明不可信。\n\n3. GPT 会对论文的相关性和创新性分别打分（评分范围均为 1–10）。\n4. 如果论文的相关性或创新性得分低于 `config.ini` 中设定的阈值，则将其过滤掉。\n5. 最后，根据相关性和创新性得分各占一半的权重，计算每篇论文的综合得分。\n\n最终，所有论文将按照其 `author_match_score` 与 GPT 评定的相关性和创新性得分之和的最大值进行排序（相关性和创新性得分仅在高于配置文件中设定的阈值时才会出现在最终输出中）。随后，这些论文会被渲染并推送到各自的端点（文本文件或 Slack）。\n\n## 贡献说明\n本仓库使用 ruff 工具：`ruff check .` 和 `ruff format .`。\n请通过运行 `pre-commit install` 安装预提交钩子。\n\n### 测试与优化 GPT 过滤器\n`filter_papers.py` 脚本也可以作为独立脚本运行。\n它会从 `in\u002Fdebug_papers.json` 中读取一批论文，按照当前的配置和提示模板进行处理，并将结果输出到 `out\u002Ffilter_paper_test.debug.json`。如果发现机器人存在误判，可以找到对应的批次文件 `out\u002Fgpt_paper_batches.debug.json`，将其复制到相应的 `debug_papers` 文件中。\n\n这样可以帮助你构建过滤器的基准测试，并查看最终的输出结果。\n\n## 其他信息\n本仓库及代码最初由 Tatsunori Hashimoto 开发，采用 Apache 2.0 许可证授权。感谢 Chenglei Si 对 GPT 过滤器的测试与基准测试工作。","# gpt_paper_assistant 快速上手指南\n\n`gpt_paper_assistant` 是一个基于 GPT-4 的每日 ArXiv 论文扫描工具。它能根据你设定的研究主题和关注的作者，自动筛选出你可能感兴趣的最新论文，并通过 GitHub Pages 静态网站或 Slack 机器人推送结果。\n\n## 环境准备\n\n在开始之前，请确保满足以下要求：\n\n*   **操作系统**：Linux, macOS 或 Windows (需配置 Python 环境)\n*   **Python 版本**：推荐 Python 3.8+\n*   **必备账号与密钥**：\n    *   **GitHub 账号**：用于托管代码、运行定时任务 (GitHub Actions) 及发布页面。\n    *   **OpenAI API Key**：用于调用 GPT 模型进行论文相关性评估 (`OAI_KEY`)。\n    *   **Semantic Scholar API Key** (可选但强烈推荐)：用于加速作者匹配过程 (`S2_KEY`)。若无此密钥，本地运行时作者搜索会变慢。\n    *   **Slack Bot Token & Channel ID** (可选)：若需通过 Slack 接收推送，需创建 Slack 应用并获取 `SLACK_KEY` 和 `SLACK_CHANNEL_ID`。\n\n## 安装步骤\n\n### 1. 部署代码库\n\n将项目 Fork 到你自己的 GitHub 仓库，或克隆到本地：\n\n```bash\ngit clone https:\u002F\u002Fgithub.com\u002F\u003C你的用户名>\u002Fgpt_paper_assistant.git\ncd gpt_paper_assistant\n```\n\n### 2. 配置运行环境\n\n#### 方案 A：使用 GitHub Actions (推荐，全自动运行)\n\n此方案利用 GitHub 的免费算力每天自动运行扫描任务。\n\n1.  **启用工作流**：如果是 Fork 的仓库，请在 GitHub 页面点击 \"Actions\" 标签页并启用 workflows。\n2.  **配置关注主题**：\n    ```bash\n    cp config\u002Fpaper_topics.template.txt config\u002Fpaper_topics.txt\n    ```\n    编辑 `config\u002Fpaper_topics.txt`，填入你感兴趣的具体研究方向（提示词格式参考下文“基本使用”）。\n3.  **配置关注作者**：\n    ```bash\n    cp config\u002Fauthors.template.txt config\u002Fauthors.txt\n    ```\n    编辑 `config\u002Fauthors.txt`，列出作者姓名及其 Semantic Scholar ID（ID 为该作者 Semantic Scholar 主页 URL 末尾的数字）。\n4.  **设置 ArXiv 分类**：\n    编辑 `config\u002Fconfig.ini`，在 `arxiv_categories` 中设定需要扫描的分类（如 `cs.CL`）。\n5.  **设置 GitHub Secrets**：\n    进入仓库 **Settings -> Secrets and variables -> Actions**，添加以下 secrets：\n    *   `OAI_KEY`: 你的 OpenAI API Key。\n    *   `S2_KEY`: 你的 Semantic Scholar API Key (可选)。\n    *   `SLACK_KEY`: Slack Bot OAuth Token (可选)。\n    *   `SLACK_CHANNEL_ID`: Slack 频道 ID (可选)。\n6.  **配置 GitHub Pages**：\n    进入仓库 **Settings -> Pages**，将 \"Build and deployment\" 的 Source 设置为 **GitHub Actions**。\n\n#### 方案 B：本地运行 (适合调试)\n\n1.  **安装依赖**：\n    ```bash\n    pip install -r requirements.txt\n    # 国内用户可使用清华源加速\n    # pip install -r requirements.txt -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple\n    ```\n2.  **配置环境变量**：\n    在终端中导出必要的密钥（Windows PowerShell 使用 `$env:VAR_NAME=\"value\"`）：\n    ```bash\n    export OAI_KEY=\"your_openai_key\"\n    export S2_KEY=\"your_semantic_scholar_key\"\n    export SLACK_KEY=\"your_slack_token\"\n    export SLACK_CHANNEL_ID=\"your_channel_id\"\n    ```\n3.  **完成配置文件**：\n    同方案 A，需手动创建并编辑 `config\u002Fpaper_topics.txt`, `config\u002Fauthors.txt` 和 `config\u002Fconfig.ini`。\n\n## 基本使用\n\n### 1. 定义筛选提示词 (Prompt)\n\n工具的核心在于 `config\u002Fpaper_topics.txt`。你需要清晰地描述你关注的领域，区分“相关”与“不相关”的情况，以避免 GPT 产生幻觉。\n\n**示例 (`config\u002Fpaper_topics.txt`)：**\n\n```text\n1. 针对 RLHF 或指令遵循的新方法论改进，特别是使语言模型在多任务中更好遵循用户指令的微调步骤。\n   - 相关：讨论具体方法如 RLHF、指令微调数据集、改进这些方法或对其进行分析的论文。\n   - 不相关：仅关于适应特定任务的论文。仅仅遵循指令或输入是不够的。\n\n2. 展示新的强力测试集污染检测方法或成员推断方法。测试集污染是指语言模型在预训练期间观察到基准数据集的现象。\n   - 相关：能检测语言模型基准污染的统计方法；提供保证的统计方法更有趣；适用于语言模型的通用成员推断方法。\n   - 不相关：不考虑语言模型或不考虑测试集污染的论文。\n\n3. 展示扩散语言模型性能的显著进步。\n   - 相关：研究同时也是扩散模型的语言模型的论文。连续扩散更相关，离散扩散次之。\n   - 不相关：关于图像扩散（如 DALL-E, Stable Diffusion）的论文，或未明确提及语言模型及文本应用的论文。\n\n在向朋友推荐论文时，请记住他喜欢统计机器学习和自然语言处理中的生成模型方面的论文。\n他也喜欢了解语言模型中令人惊讶的实证结果以及巧妙的统计技巧。\n他不想阅读主要关于将方法应用于特定领域的论文。\n```\n\n### 2. 运行与验证\n\n#### 在 GitHub Actions 上运行\n配置完成后，工作流默认会在每天 **UTC 时间 13:00** (北京时间 21:00) 自动运行。\n*   **手动测试**：进入仓库 \"Actions\" 标签页，选择对应 workflow，点击 \"Run workflow\" 立即触发一次运行。\n*   **查看结果**：运行成功后，结果将发布在 GitHub Pages 网站上（地址通常为 `https:\u002F\u002F\u003C你的用户名>.github.io\u002Fgpt_paper_assistant\u002F`），若配置了 Slack，也会收到推送消息。\n\n#### 在本地运行\n执行主脚本即可启动单次扫描：\n\n```bash\npython main.py\n```\n\n输出文件将生成在项目目录中（取决于 `config\u002Fconfig.ini` 中的 `dump_json`, `dump_md` 等配置），若配置了 Slack 且网络通畅，也会发送通知。\n\n### 3. 调试 GPT 过滤器\n\n如果发现筛选结果不准确，可以单独测试过滤逻辑：\n\n1.  准备一批测试论文数据放入 `in\u002Fdebug_papers.json`。\n2.  运行过滤脚本：\n    ```bash\n    python filter_papers.py\n    ```\n3.  查看输出结果 `out\u002Ffilter_paper_test.debug.json`，根据结果调整 `config\u002Fpaper_topics.txt` 中的提示词或 `config\u002Fconfig.ini` 中的阈值。","某高校自然语言处理实验室的博士生李明，正致力于跟踪大模型推理方向的最新进展，每天需面对 ArXiv 上爆发的数百篇新论文。\n\n### 没有 gpt_paper_assistant 时\n- **信息过载严重**：每天手动浏览 `cs.CL` 等分类下的上百篇论文标题和摘要，耗时超过 2 小时，极易遗漏关键研究。\n- **个性化匹配困难**：难以从海量数据中精准筛选出特定导师（如 Percy Liang）或特定细分主题（如“思维链”）的相关文章，全靠人工记忆和关键词搜索。\n- **协作同步滞后**：发现的好文章需要手动复制链接发到课题组 Slack 群，经常因忙碌而忘记分享，导致团队信息不同步。\n- **成本与效率失衡**：若尝试用普通脚本全量调用大模型分析所有论文，API 费用高昂且速度慢，无法作为日常工具持续运行。\n\n### 使用 gpt_paper_assistant 后\n- **智能每日精选**：gpt_paper_assistant 每天自动扫描 ArXiv，利用 GPT-4 结合预设主题过滤无关内容，仅推送最相关的几篇核心论文，将阅读时间压缩至 15 分钟。\n- **精准作者与主题追踪**：通过配置 Semantic Scholar 作者 ID 和自定义主题文件，工具能精准捕捉目标学者的新作及特定技术方向的突破，不再依赖人工检索。\n- **自动化团队分发**：筛选结果自动发布至静态网页并推送到实验室 Slack 频道，确保团队成员第一时间获取高价值情报，促进即时讨论。\n- **低成本稳定运行**：借助标题预过滤机制，单日运行成本仅需几分钱（如 $0.07），并通过 GitHub Actions 实现无人值守的常态化更新。\n\ngpt_paper_assistant 将研究人员从繁琐的信息筛选中解放出来，实现了低成本、高精度的个性化学术情报自动化闭环。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Ftatsu-lab_gpt_paper_assistant_7108d0b7.png","tatsu-lab","Tatsu's shared repositories","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Ftatsu-lab_d160c91d.png","Tatsu's shared repos",null,"https:\u002F\u002Fgithub.com\u002Ftatsu-lab",[80],{"name":81,"color":82,"percentage":83},"Python","#3572A5",100,546,142,"2026-03-23T12:25:03","Apache-2.0","Linux, macOS, Windows","不需要 GPU","未说明",{"notes":92,"python":90,"dependencies":93},"该工具主要依赖外部 API（OpenAI GPT-4\u002F3.5 和 Semantic Scholar），本地几乎无计算需求。可通过 GitHub Actions 定时运行，也可在任意安装了 Python 环境的廉价虚拟机（如 AWS）上通过 crontab 运行。需配置 OpenAI API Key，可选配置 Semantic Scholar API Key 以加速作者搜索，以及 Slack Bot Token 用于通知。依赖库需通过 requirements.txt 安装，但 README 未列出具体包名和版本。",[94,95,96],"requests (隐含，用于 RSS\u002FAPI)","openai (隐含，用于 GPT 调用)","semantic-scholar API (可选)",[98,13,35],"其他",[100,101,102],"arxiv","gpt","research","2026-03-27T02:49:30.150509","2026-04-07T02:01:03.661823",[106,111,116,121,126,131],{"id":107,"question_zh":108,"answer_zh":109,"source_url":110},20737,"如何配置项目以使用 GPT-3.5 而不是 GPT-4？","在 `config.ini` 文件中，注释掉 GPT-4 Turbo 的配置行，并取消注释 GPT-3.5 的配置行即可切换。但请注意，维护者警告称 GPT-3.5 在此任务上表现较差，可能会筛选出大量质量不高的论文。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fgpt_paper_assistant\u002Fissues\u002F3",{"id":112,"question_zh":113,"answer_zh":114,"source_url":115},20738,"为什么推荐的论文相关性低或新颖性不足？","即使温度参数设为 0，语言模型仍存在随机性，尤其在相关性评分较低时影响更大。该系统旨在追求高召回率，因此倾向于包含一些“奇怪”的论文。解决方案是修改提示词（特别是指令微调类别，该部分误报率较高）或调整 `config.ini` 中的截断阈值。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fgpt_paper_assistant\u002Fissues\u002F1",{"id":117,"question_zh":118,"answer_zh":119,"source_url":120},20739,"如何查看或保存历史每日论文推荐记录？","建议使用 Slack 端点，因为 Slack 会自动存储所有历史消息。如果希望拥有网页版存档，可以自行修改代码以存储每天生成的 `output.md` 文件并展示最近几天的内容。但维护者指出，为了减少状态管理的维护开销，官方不会直接提供此功能。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fgpt_paper_assistant\u002Fissues\u002F2",{"id":122,"question_zh":123,"answer_zh":124,"source_url":125},20740,"遇到 'No new papers' 错误或 'ValueError: not enough values to unpack' 怎么办？","这通常是一个已修复的 Bug。该错误代码路径原本只应在周六或周日没有新论文发布时触发。虽然 RSS 源显示有论文，但这些往往是周五已抓取过的。请确保代码已更新到最新版本（参考提交记录 13d54f1），此问题不应导致论文遗漏。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fgpt_paper_assistant\u002Fissues\u002F4",{"id":127,"question_zh":128,"answer_zh":129,"source_url":130},20741,"运行时报错 'UnboundLocalError: local variable cost referenced before assignment' 如何解决？","这是一个计数器相关的代码错误，已在最新的代码提交中修复（参考提交记录 11d0240）。请拉取最新代码重新运行即可解决。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fgpt_paper_assistant\u002Fissues\u002F11",{"id":132,"question_zh":133,"answer_zh":134,"source_url":135},20742,"项目是使用 ArXiv RSS 源还是 API 端点来获取论文？哪个更可靠？","目前项目已切换为仅使用 RSS 源。虽然之前提到 RSS 可能存在格式变更或不稳定的情况，但维护者测试发现它能覆盖绝大多数论文，因此决定暂时只用 RSS 并观察是否有遗漏。","https:\u002F\u002Fgithub.com\u002Ftatsu-lab\u002Fgpt_paper_assistant\u002Fissues\u002F8",[]]