[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"tag-benchmark":3},{"tag":4,"articles":10,"peer_article_count":99},{"id":5,"name":6,"slug":6,"article_count":7,"description_zh":8,"description_en":9},"736c4d52-f7e2-4456-a45f-50aae8402b4e","benchmark",6,"Benchmark 不只是比誰分數高，而是用固定任務檢查模型、代理與編譯器在真實條件下的穩定性。從長鏈推理、資料視覺化工作流到程式碼安全與效能，基準測試也在考驗方法是否可信。","Benchmarking is how teams check whether models, agents, and compilers hold up under fixed tasks and real constraints. It covers long-horizon reasoning, data-viz workflows, code safety, and performance, while also exposing how much a score can be distorted by the test itself.",[11,20,27,34,42,49,57,64,71,78,85,92],{"id":12,"slug":13,"title":14,"summary":15,"category":16,"image_url":17,"cover_image":17,"language":18,"created_at":19},"354441d5-652c-4658-a446-14f101f5e084","rootly-benchmark-llama-4-trails-coding-models-en","Rootly benchmark: Llama 4 trails coding models","Rootly AI Labs says Llama 4 lagged coding-focused models on a Mastodon GitHub benchmark, with GPT-4o and Qwen2.5-Coder ahead.","research","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1782086567786-wz4t.png","en","2026-06-22T00:02:22.751682+00:00",{"id":21,"slug":22,"title":23,"summary":24,"category":16,"image_url":25,"cover_image":25,"language":18,"created_at":26},"d910529d-15c0-498a-a930-85e14c6ef748","reprorepo-github-issues-reproducibility-audits-en","ReproRepo scales reproducibility audits with GitHub issues","ReproRepo uses GitHub issues to scale reproducibility audits for machine learning papers.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1781678880894-uawp.png","2026-06-17T06:47:35.608681+00:00",{"id":28,"slug":29,"title":30,"summary":31,"category":16,"image_url":32,"cover_image":32,"language":18,"created_at":33},"2a85882b-ba8c-44c8-809e-e19691776f37","clinhallu-medical-mllm-hallucination-benchmark-en","ClinHallu maps where medical MLLMs hallucinate","ClinHallu diagnoses where medical MLLM hallucinations come from across vision, knowledge, and reasoning stages.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1781504273229-o70v.png","2026-06-15T06:17:23.262119+00:00",{"id":35,"slug":36,"title":37,"summary":38,"category":39,"image_url":40,"cover_image":40,"language":18,"created_at":41},"65872119-5c63-409f-b8f9-338096299326","fable-5-claude-code-like-coworker-en","Fable 5 让 Claude Code 更像真同事","我拆了这篇测评，整理出一套把 Fable 5 用进 coding 和 agent 工作流的可复制模板。","ai-agent","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1781324307625-of8c.png","2026-06-13T04:18:01.203421+00:00",{"id":43,"slug":44,"title":45,"summary":46,"category":16,"image_url":47,"cover_image":47,"language":18,"created_at":48},"f8a9ee26-3c18-46df-9272-326b66003c35","evoarena-llm-agents-dynamic-environments-en","EvoArena tests LLM agents in changing worlds","EvoArena benchmarks how LLM agents handle changing environments, and EvoMem adds patch-based memory updates to help them adapt.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1781245086782-a7nm.png","2026-06-12T06:17:33.232458+00:00",{"id":50,"slug":51,"title":52,"summary":53,"category":54,"image_url":55,"cover_image":55,"language":18,"created_at":56},"d1a3f7e9-4415-4158-afbc-1327e7148fb3","gpt-5-5-senior-engineer-benchmark-every-en","GPT-5.5 scores 62.5 on Every’s engineer test","Every says GPT-5.5 beat Opus 4.7 on its Senior Engineer Benchmark, scoring 62.5 on its best run and landing as OpenAI’s work model.","model-release","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1779538549724-ut61.png","2026-05-23T12:15:27.068447+00:00",{"id":58,"slug":59,"title":60,"summary":61,"category":16,"image_url":62,"cover_image":62,"language":18,"created_at":63},"653c628b-7930-4183-9dbc-8e50cf85c479","cattle-trade-llm-bluffing-bargaining-benchmark-en","Cattle Trade benchmarks LLM bluffing and bargaining","Cattle Trade is a multi-agent benchmark for testing how LLMs bluff, bid, and bargain in negotiation tasks.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1779085436536-nesm.png","2026-05-18T06:23:28.591525+00:00",{"id":65,"slug":66,"title":67,"summary":68,"category":16,"image_url":69,"cover_image":69,"language":18,"created_at":70},"d60602fc-ed44-4c5e-8aa1-b0285672b8ba","entitybench-long-range-video-consistency-en","EntityBench Tackles Long-Range Video Consistency","EntityBench measures whether video models keep characters, objects, and locations consistent across long, multi-shot sequences.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778911850469-mgcy.png","2026-05-16T06:10:29.577019+00:00",{"id":72,"slug":73,"title":74,"summary":75,"category":16,"image_url":76,"cover_image":76,"language":18,"created_at":77},"442f0ac0-6fd2-460b-83ab-694f0627d98f","longmemeval-v2-agent-memory-web-workflows-en","LongMemEval-V2 tests agent memory in web workflows","A new benchmark checks whether agent memory can retain web-environment experience, not just user history, and improve long-term task recall.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778653256519-96ad.png","2026-05-13T06:20:30.047955+00:00",{"id":79,"slug":80,"title":81,"summary":82,"category":16,"image_url":83,"cover_image":83,"language":18,"created_at":84},"f414aa1a-27e8-45d9-b407-d542121915d2","llms-procedural-execution-diagnostic-study-en","When LLMs Stop Following Procedural Steps","A diagnostic benchmark shows LLMs lose procedural fidelity as step counts grow, even when the arithmetic stays simple.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1777875670060-pmbt.png","2026-05-04T06:20:27.84519+00:00",{"id":86,"slug":87,"title":88,"summary":89,"category":16,"image_url":90,"cover_image":90,"language":18,"created_at":91},"6bf86d0c-df4b-4e0c-82b7-1c06b2ef80d5","asmr-bench-sabotage-detection-ml-code-en","ASMR-Bench Tests Sabotage Detection in ML Code","ASMR-Bench probes whether auditors can spot subtle sabotage in ML research codebases, and the answer so far is: not reliably.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1776665038230-idp9.png","2026-04-20T06:03:33.439449+00:00",{"id":93,"slug":94,"title":95,"summary":96,"category":16,"image_url":97,"cover_image":97,"language":18,"created_at":98},"9f62add5-cae5-47eb-abd5-2e56d0d5698c","longcot-long-horizon-chain-of-thought-benchmark-en","LongCoT Benchmark: 2,500-Probl. Long-Horizon Reasoning","LongCoT is a 2,500-problem benchmark for measuring whether frontier models can sustain long, interdependent reasoning chains.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1776319782523-s0wz.png","2026-04-16T06:09:23.265233+00:00",31]