[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"tag-llm-inference":3},{"tag":4,"articles":11,"peer_article_count":57},{"id":5,"name":6,"slug":7,"article_count":8,"description_zh":9,"description_en":10},"a487ff8b-bc7c-473d-b9f2-867dd22c9327","LLM inference","llm-inference",4,"LLM 推論聚焦模型在部署時的延遲、吞吐量與記憶體成本，尤其是 KV cache、量化與加速器友善的實作。這類技術直接影響大模型能否在雲端與邊緣裝置上穩定運行。","LLM inference covers the runtime side of large models: latency, throughput, memory footprint, and how KV cache, quantization, and accelerator-friendly kernels shape deployment. It matters because these choices determine whether a model is practical on GPUs, servers, or edge devices.",[12,21,29,36,43,50],{"id":13,"slug":14,"title":15,"summary":16,"category":17,"image_url":18,"cover_image":18,"language":19,"created_at":20},"cdee5351-7d06-4653-bf20-5530b470ebfe","openai-jalapeno-llm-inference-chip-zh","OpenAI Jalapeño 指向更快的 LLM 推理","1 顆 Jalapeño、1 次 Broadcom 合作、1 個多代平台：OpenAI 正把 LLM 推理做成更快、更穩、也更可控的硬體路線。","industry","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1782598661806-0sm3.png","zh","2026-06-27T22:17:20.214678+00:00",{"id":22,"slug":23,"title":24,"summary":25,"category":26,"image_url":27,"cover_image":27,"language":19,"created_at":28},"6f25a29c-cbb8-4f53-9af7-1656b394333a","turboquant-cuts-kv-cache-memory-6x-google-tests-zh","TurboQuant 在 Google 測試中省下 6x KV 快取","Google Research 公布 TurboQuant，可把 KV cache 記憶體壓到至少 6x 以上，並在長上下文測試中維持接近全精度表現。","research","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1780906682236-sqe2.png","2026-06-08T08:17:21.878314+00:00",{"id":30,"slug":31,"title":32,"summary":33,"category":26,"image_url":34,"cover_image":34,"language":19,"created_at":35},"9580adce-69ec-4880-ad8b-227c384cb377","marlin-greener-llm-inference-datacenters-zh","MARLIN 用多代理 RL 省雲端推理資源","MARLIN 把雲端 LLM 推理視為多代理協調問題，用遊戲理論式強化學習來追求更永續的資料中心運作。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1779084247021-qzhd.png","2026-05-18T06:03:35.259834+00:00",{"id":37,"slug":38,"title":39,"summary":40,"category":26,"image_url":41,"cover_image":41,"language":19,"created_at":42},"941f698a-1dcf-4807-bd56-5295c07d2dee","taming-black-box-llm-inference-scheduling-zh","黑箱 LLM 排程更聰明了","這篇論文用「預測輸出長度」來改善黑箱 LLM 推論排程，想在看不到模型內部的情況下，減少排隊摩擦、提升大規模服務效率。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778740253221-wgy6.png","2026-05-14T06:30:31.546746+00:00",{"id":44,"slug":45,"title":46,"summary":47,"category":26,"image_url":48,"cover_image":48,"language":19,"created_at":49},"db0d0cbe-b1ba-4f1e-9569-f902e41bb3b0","saga-workflow-atomic-scheduling-gpu-clusters-zh","SAGA 讓 AI Agent 排程看懂工作流","SAGA 主張 GPU 排程不該把 AI agent 的每次 LLM 呼叫拆開看，而是要把一連串請求當成同一個工作流來排。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778567467043-imbu.png","2026-05-12T06:30:31.788116+00:00",{"id":51,"slug":52,"title":53,"summary":54,"category":26,"image_url":55,"cover_image":55,"language":19,"created_at":56},"13197f11-d68b-468c-aa9f-9e84b85673d2","speckv-adaptive-speculative-decoding-gamma-zh","SpecKV 讓推測解碼自動調 gamma","SpecKV 把推測解碼的 token 預算改成逐步自動調整，利用 draft 模型訊號在不同壓縮設定下挑出更合適的 gamma。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1777961462925-xmg2.png","2026-05-05T06:10:32.259958+00:00",16]