[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"tag-reinforcement-learning":3},{"tag":4,"articles":11,"peer_article_count":137},{"id":5,"name":6,"slug":7,"article_count":8,"description_zh":9,"description_en":10},"d52d08ae-f7f9-4625-ada6-d32a7bcd1036","reinforcement learning","reinforcement-learning",15,"強化學習研究如何讓模型在回饋訊號下逐步學會決策，常見於機器人控制、長期代理訓練與 LLM 微調。這個主題也涵蓋 PPO、BRRL、持續學習與安全約束等方法，重點在穩定更新、長期規劃與部署風險。","Reinforcement learning studies how models learn decisions from feedback over time, and it underpins robot control, long-horizon agent training, and LLM fine-tuning. Recent work spans PPO variants, safe continual RL, stability, and planning under changing environments.",[12,21,28,35,42,49,56,63,71,78,85,92,99,106,113,121,129],{"id":13,"slug":14,"title":15,"summary":16,"category":17,"image_url":18,"cover_image":18,"language":19,"created_at":20},"93b19c63-dbfd-4277-92b5-b5a60946fd65","river-llm-reinforcement-learning-without-answers-zh","RiVER 讓 LLM 不靠標準答案也能學","RiVER 證明 LLM 可以只靠執行回饋與分數校準，在沒有標準答案的任務上學出更好的策略。","research","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1782454671897-i8l3.png","zh","2026-06-26T06:17:26.979468+00:00",{"id":22,"slug":23,"title":24,"summary":25,"category":17,"image_url":26,"cover_image":26,"language":19,"created_at":27},"a875d002-f6f0-4139-abc1-f1602bc42fee","self-distillation-shrinks-output-diversity-zh","自蒸餾會縮小模型多樣性","這篇論文指出，自蒸餾能拉高 pass@1，卻會壓縮輸出多樣性，讓模型在分布外情境更脆弱。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1782369171288-egwp.png","2026-06-25T06:32:26.557584+00:00",{"id":29,"slug":30,"title":31,"summary":32,"category":17,"image_url":33,"cover_image":33,"language":19,"created_at":34},"88f6d8ec-e98a-42c4-a54c-78b5a8d67a2a","turing-rl-user-simulator-rewards-zh","Turing-RL 讓模擬使用者更像真人","Turing-RL 用 LLM 裁判做強化學習，讓使用者模擬器追求「像真人」而不是對齊單一標準答案。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1781763487848-hcqd.png","2026-06-18T06:17:31.073525+00:00",{"id":36,"slug":37,"title":38,"summary":39,"category":17,"image_url":40,"cover_image":40,"language":19,"created_at":41},"4d112775-a2e2-4f60-b930-0fe333a2e42c","contextrl-teaches-llms-to-pick-right-evidence-zh","ContextRL 讓 LLM 學會挑證據","ContextRL 用對比式上下文選擇強化學習，讓模型先挑對證據，再回答問題，改善長上下文與多模態推理的 grounding。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1781590688900-6wns.png","2026-06-16T06:17:29.909578+00:00",{"id":43,"slug":44,"title":45,"summary":46,"category":17,"image_url":47,"cover_image":47,"language":19,"created_at":48},"ba442703-edfa-4353-b256-db502d94a99e","mana-articulated-tool-manipulation-animation-zh","Mana把工具操作改寫成動畫","Mana 把具關節工具操作改寫成動畫流程，讓機器人能零樣本把模擬學到的動作直接搬到真實世界。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1781246882933-bvjm.png","2026-06-12T06:47:29.612828+00:00",{"id":50,"slug":51,"title":52,"summary":53,"category":17,"image_url":54,"cover_image":54,"language":19,"created_at":55},"75bcc569-5e89-45c8-b809-6f169e929f4b","rl-training-hands-off-control-gradually-zh","RL 先接管再放手","這篇論文證明，RL 可以先借用可用的基線策略，再把控制權逐步交給學到的新策略。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1780986786312-03yo.png","2026-06-09T06:32:32.849589+00:00",{"id":57,"slug":58,"title":59,"summary":60,"category":17,"image_url":61,"cover_image":61,"language":19,"created_at":62},"b38c56a6-e7f3-45fb-b100-d37e7b3ed417","reinforcement-aware-distillation-llm-reasoning-zh","強化感知蒸餾，想把推理一起學進去","這篇論文提出強化感知知識蒸餾，目標不是只壓縮答案，而是把 LLM 的推理行為一起轉移給學生模型。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1780646589500-0me6.png","2026-06-05T08:02:33.908932+00:00",{"id":64,"slug":65,"title":66,"summary":67,"category":68,"image_url":69,"cover_image":69,"language":19,"created_at":70},"cc0a7cf8-f5b6-4932-a9dc-6f7c24f956f8","cursor-composer-2-5-kimi-k2-5-low-price-zh","Cursor Composer 2.5 為什麼這麼便宜","Cursor Composer 2.5 用 Moonshot 的 Kimi K2.5 和即時 RL，把 AI 寫程式成本壓到很低，價格差距直接衝擊 Claude 和 GPT-5.5。","tools","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1780427883819-ocxy.png","2026-06-02T19:17:33.090199+00:00",{"id":72,"slug":73,"title":74,"summary":75,"category":17,"image_url":76,"cover_image":76,"language":19,"created_at":77},"712fec94-021a-4655-bf6b-75ef7be2f5fb","mobilegym-verifiable-parallel-mobile-gui-sim-zh","MobileGym 讓手機 GUI 代理可大規模測試","MobileGym 把手機 GUI 代理的評估變成可重現、可平行擴展的流程，靠結構化狀態與決定性判分，讓訓練和測試更穩定。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1779775564158-bm0a.png","2026-05-26T06:05:35.355803+00:00",{"id":79,"slug":80,"title":81,"summary":82,"category":17,"image_url":83,"cover_image":83,"language":19,"created_at":84},"b9516feb-41d5-42a3-887e-7b47c5c9ffb7","atlas-one-token-visual-reasoning-zh","ATLAS 用一個 token 做視覺推理","ATLAS 提出用單一離散 token 同時承擔 agentic 與 latent 視覺推理，想降低中間步驟成本，並維持標準 next-token 訓練流程。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778912032775-hp0w.png","2026-05-16T06:13:34.693651+00:00",{"id":86,"slug":87,"title":88,"summary":89,"category":17,"image_url":90,"cover_image":90,"language":19,"created_at":91},"7a04d752-3f1a-4df7-b7c5-8bcb1e69c565","bounded-ratio-reinforcement-learning-ppo-zh","BRRL 取代 PPO 剪裁：BPO 與 GBPO 的穩定性升級","BRRL 把 PPO 的剪裁目標改寫成有界比例框架，推出 BPO 與 GBPO，主打更穩定的更新與更清楚的理論基礎。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1776751794578-t5j7.png","2026-04-21T06:09:39.661696+00:00",{"id":93,"slug":94,"title":95,"summary":96,"category":17,"image_url":97,"cover_image":97,"language":19,"created_at":98},"46ad5553-2eab-41b1-8602-82bf7fb94933","llm-generalization-shortest-path-scale-zh","LLM 會看地圖，卻撐不住長度","這篇合成最短路徑研究把「會換地圖」和「能拉長題目」拆開看，結果發現 LLM 能跨地圖泛化，卻在長度變長時因遞迴推理不穩而失手。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1776406013309-pvmm.png","2026-04-17T06:06:33.258278+00:00",{"id":100,"slug":101,"title":102,"summary":103,"category":17,"image_url":104,"cover_image":104,"language":19,"created_at":105},"ff7d80fb-56b3-4d87-94cc-ad38b20f6e5d","physics-simulators-rl-llm-reasoning-zh","用物理模擬器訓練 LLM 推理","研究者把物理模擬器變成強化學習資料來源，訓練 LLM 學會物理推理，並在 IPhO 題目上帶來 zero-shot 提升。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1776146993167-rwzt.png","2026-04-14T06:09:32.812614+00:00",{"id":107,"slug":108,"title":109,"summary":110,"category":17,"image_url":111,"cover_image":111,"language":19,"created_at":112},"5e4f3620-9a8e-4185-84d2-fa8ef42fc058","act-wisely-tool-use-agentic-multimodal-models-zh","教代理何時別叫工具","HDPO 把「答對」和「少叫工具」分開訓練，想修正多模態代理的盲目工具使用。摘要稱它能大幅減少呼叫次數，同時提升推理正確率。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775801029065-5n2l.png","2026-04-10T06:03:34.31315+00:00",{"id":114,"slug":115,"title":116,"summary":117,"category":118,"image_url":119,"cover_image":119,"language":19,"created_at":120},"779f5798-9c39-4ce2-95d7-f0abfd24a695","five-ai-infra-frontiers-bessemer-2026-zh","Bessemer 看準的 5 個 AI 基礎設施前線","Bessemer 2026 AI infra 藍圖指向 memory、continual learning、RL、inference 與 world models。重點不是更大模型，而是讓 AI 真正進到生產環境。","industry","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775164388114-uo7t.png","2026-04-02T21:12:39.852377+00:00",{"id":122,"slug":123,"title":124,"summary":125,"category":126,"image_url":127,"cover_image":127,"language":19,"created_at":128},"c34422da-87f3-4b42-9f47-36ef66e0760e","build-ai-crypto-trading-bot-guide-zh","如何打造 AI 加密貨幣交易機器人","2026 AI 加密貨幣交易機器人實作指南：資料管線、模型選擇、風控、部署與合規，幫你把想法變成能上線的系統。","blockchain","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775121945326-q2q9.png","2026-04-02T08:12:40.708166+00:00",{"id":130,"slug":131,"title":132,"summary":133,"category":68,"image_url":134,"cover_image":135,"language":19,"created_at":136},"ce38adca-0f38-4eae-8155-97ac51582a85","cursor-self-hosted-agents-real-time-rl-zh","Cursor 推自架代理與即時 RL","Cursor 在 2026 年 3 月推出自架雲端代理，並公開 Composer 的即時 RL 訓練法。官方稱新 checkpoint 最快每 5 小時更新一次，企業可把程式碼與工具執行留在自家網路內。",null,"https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1774497189210-w1wd.png","2026-03-28T03:10:51.511587+00:00",24]