[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"tag-agent-testing":3},{"tag":4,"articles":10},{"id":5,"name":6,"slug":7,"article_count":8,"description_zh":9,"description_en":9},"b667f2bc-7358-4926-987f-39949e0a6632","agent testing","agent-testing",0,null,[11],{"id":12,"slug":13,"title":14,"summary":15,"category":16,"image_url":17,"cover_image":17,"language":18,"created_at":19},"e891adc0-af64-41c7-bb41-d75e6506d388","ai-benchmarks-2026-evaluations-limits-en","AI Benchmarks 2026: Top Evaluations and Limits","MMLU, HLE, SWE-Bench and agent tests are hitting limits in 2026, while production gaps and contamination keep human review necessary.","research","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1781381870944-h208.png","en","2026-06-13T20:17:26.361723+00:00"]