{"last_updated":"2026-05-04","leaderboard":[{"run_name":"icl-claude-sonnet-4.6","tasks_included":6,"normalized_reward_mean":0.22311631078975627,"normalized_gain_mean":0.25441550654189665,"total_cost_usd_mean":30.42709704,"mean_task_cost_usd":5.07118284,"rank":1,"display_name":"ICL \u00b7 Claude Sonnet 4.6","system_label":"ICL","system_class":"icl","model":"Claude Sonnet 4.6"},{"run_name":"icl-gpt-5.4","tasks_included":6,"normalized_reward_mean":0.20064928943952073,"normalized_gain_mean":0.20064928943952073,"total_cost_usd_mean":18.3895509,"mean_task_cost_usd":3.06492515,"rank":2,"display_name":"ICL \u00b7 GPT-5.4","system_label":"ICL","system_class":"icl","model":"GPT-5.4"},{"run_name":"claude-code-sonnet-4.6","tasks_included":6,"normalized_reward_mean":0.19014463901361522,"normalized_gain_mean":0.23900154529064965,"total_cost_usd_mean":38.59765761,"mean_task_cost_usd":6.432942935,"rank":3,"display_name":"Claude Code \u00b7 Sonnet 4.6","system_label":"Claude Code","system_class":"claude-code","model":"Sonnet 4.6"},{"run_name":"mem0-gpt-5.4","tasks_included":6,"normalized_reward_mean":0.15050524004562157,"normalized_gain_mean":0.20240275723200707,"total_cost_usd_mean":18.3428094,"mean_task_cost_usd":3.0571349,"rank":4,"display_name":"Mem0 \u00b7 GPT-5.4","system_label":"Mem0","system_class":"mem0","model":"GPT-5.4"},{"run_name":"icl-claude-opus-4.7","tasks_included":6,"normalized_reward_mean":0.1017418903675021,"normalized_gain_mean":0.1945244012067302,"total_cost_usd_mean":49.6220976,"mean_task_cost_usd":8.2703496,"rank":5,"display_name":"ICL \u00b7 Claude Opus 4.7","system_label":"ICL","system_class":"icl","model":"Claude Opus 4.7"},{"run_name":"icl-notepad-gpt-5.4","tasks_included":6,"normalized_reward_mean":0.07969184363257699,"normalized_gain_mean":0.07804636575885458,"total_cost_usd_mean":14.2771118,"mean_task_cost_usd":2.3795186333333334,"rank":6,"display_name":"ICL Notepad \u00b7 GPT-5.4","system_label":"ICL Notepad","system_class":"icl-notepad","model":"GPT-5.4"},{"run_name":"icl-gemini-3-flash","tasks_included":6,"normalized_reward_mean":0.07961924623425214,"normalized_gain_mean":0.1640453480484305,"total_cost_usd_mean":7.60189571,"mean_task_cost_usd":1.2669826183333333,"rank":7,"display_name":"ICL \u00b7 Gemini 3 Flash","system_label":"ICL","system_class":"icl","model":"Gemini 3 Flash"},{"run_name":"codex-gpt-5.4","tasks_included":6,"normalized_reward_mean":0.06569361292259214,"normalized_gain_mean":0.14575997342763694,"total_cost_usd_mean":27.208425599999998,"mean_task_cost_usd":4.5347376,"rank":8,"display_name":"Codex \u00b7 GPT-5.4","system_label":"Codex","system_class":"codex","model":"GPT-5.4"},{"run_name":"ace-gpt-5.4","tasks_included":6,"normalized_reward_mean":0.04609805079359747,"normalized_gain_mean":0.08573041367877687,"total_cost_usd_mean":62.754812,"mean_task_cost_usd":10.459135333333334,"rank":9,"display_name":"ACE \u00b7 GPT-5.4","system_label":"ACE","system_class":"ace","model":"GPT-5.4"},{"run_name":"icl-notepad-claude-sonnet-4-6","tasks_included":6,"normalized_reward_mean":0.03471882200182727,"normalized_gain_mean":0.18218790713771763,"total_cost_usd_mean":31.52601021,"mean_task_cost_usd":5.2543350349999995,"rank":10,"display_name":"ICL Notepad \u00b7 Claude Sonnet 4.6","system_label":"ICL Notepad","system_class":"icl-notepad","model":"Claude Sonnet 4.6"},{"run_name":"icl-notepad-gemini-3.1-pro-preview","tasks_included":6,"normalized_reward_mean":-0.0015973176027631296,"normalized_gain_mean":0.09359157264606553,"total_cost_usd_mean":13.31752216,"mean_task_cost_usd":2.2195870266666664,"rank":11,"display_name":"ICL Notepad \u00b7 Gemini 3.1 Pro Preview","system_label":"ICL Notepad","system_class":"icl-notepad","model":"Gemini 3.1 Pro Preview"},{"run_name":"icl-gemini-3.1-pro-preview","tasks_included":6,"normalized_reward_mean":-0.05629401792134888,"normalized_gain_mean":0.06182060780612062,"total_cost_usd_mean":15.228114,"mean_task_cost_usd":2.538019,"rank":12,"display_name":"ICL \u00b7 Gemini 3.1 Pro Preview","system_label":"ICL","system_class":"icl","model":"Gemini 3.1 Pro Preview"}],"task_summary":[{"run_name":"ace-gpt-5.4","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":19.777959999999997,"reward_se":0.00949555685570934,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain_mean":0.01695999999999458,"gain_se":0.00949555685570934,"normalized_reward_mean":0.00024146129643068067,"normalized_reward_se":0.00013518923754195445,"normalized_gain_mean":0.00024146129643068067,"normalized_gain_se":0.00013518923754195445,"cost_mean":3.955291,"cost_se":0.3275994812520694,"latency_mean":3.9863659999999994,"latency_se":0.07174654576981385,"warnings":""},{"run_name":"ace-gpt-5.4","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":11.58,"reward_se":0.1806931099959262,"baseline_reward":10.0,"reference_reward":19.0,"gain_mean":1.5799999999999994,"gain_se":0.1806931099959262,"normalized_reward_mean":0.22303664921465968,"normalized_reward_se":0.01892074450219122,"normalized_gain_mean":0.1755555555555555,"normalized_gain_se":0.020077012221769573,"cost_mean":15.6596135,"cost_se":0.7904778063130553,"latency_mean":3.8248995999999997,"latency_se":0.028832460945260947,"warnings":""},{"run_name":"ace-gpt-5.4","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.7587200000000001,"reward_se":0.12682443928517878,"baseline_reward":0.37550000000000006,"reference_reward":3.24404,"gain_mean":0.38322,"gain_se":0.12682443928517878,"normalized_reward_mean":-0.10481253945251026,"normalized_reward_se":0.05637794361744126,"normalized_gain_mean":0.1335940931623753,"normalized_gain_se":0.044212191318642506,"cost_mean":12.797261500000001,"cost_se":1.1101032262253812,"latency_mean":6.3340134,"latency_se":0.08373720034823236,"warnings":""},{"run_name":"ace-gpt-5.4","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":7.8533333333333335,"reward_se":1.2908911650483939,"baseline_reward":5.466666666666667,"reference_reward":40.0,"gain_mean":2.386666666666666,"gain_se":1.2908911650483939,"normalized_reward_mean":0.06731141199226306,"normalized_reward_se":0.03745332200333831,"normalized_gain_mean":0.0691119691119691,"normalized_gain_se":0.037381018292907166,"cost_mean":8.775943999999999,"cost_se":0.6972264993823741,"latency_mean":3.7047094,"latency_se":0.0250713825697747,"warnings":""},{"run_name":"ace-gpt-5.4","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":143.5,"reward_se":41.19626196634835,"baseline_reward":141.9,"reference_reward":1138.5,"gain_mean":1.5999999999999943,"gain_se":41.19626196634835,"normalized_reward_mean":0.009851726539954229,"normalized_reward_se":0.04099538458189706,"normalized_gain_mean":0.001605458559100937,"normalized_gain_se":0.04133680711052414,"cost_mean":13.202943,"cost_se":0.8066581250262561,"latency_mean":4.8480474000000005,"latency_se":0.07989380825671039,"warnings":""},{"run_name":"ace-gpt-5.4","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":6.11566,"reward_se":0.128647835582259,"baseline_reward":5.202999999999999,"reference_reward":12.0,"gain_mean":0.9126600000000007,"gain_se":0.128647835582259,"normalized_reward_mean":0.08095959517078738,"normalized_reward_se":0.020092747681799708,"normalized_gain_mean":0.13427394438722975,"normalized_gain_se":0.018927149563374872,"cost_mean":8.363759,"cost_se":0.4792756011423647,"latency_mean":7.2718875999999995,"latency_se":0.06730373458999146,"warnings":""},{"run_name":"claude-code-sonnet-4.6","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":44.28216000000001,"reward_se":1.4489101330310272,"baseline_reward":19.7601,"reference_reward":90.0,"gain_mean":24.52206000000001,"gain_se":1.4489101330310272,"normalized_reward_mean":0.3491103233246488,"normalized_reward_se":0.020628285326257878,"normalized_gain_mean":0.34911866332383745,"normalized_gain_se":0.020628021011291693,"cost_mean":10.397260560000001,"cost_se":2.26760027497843,"latency_mean":23.9291564,"latency_se":4.449749971346858,"warnings":""},{"run_name":"claude-code-sonnet-4.6","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":6.63,"reward_se":1.7549109664025693,"baseline_reward":5.8999999999999995,"reference_reward":19.0,"gain_mean":0.7300000000000004,"gain_se":1.7549109664025693,"normalized_reward_mean":-0.29528795811518316,"normalized_reward_se":0.18376031061807008,"normalized_gain_mean":0.05572519083969468,"normalized_gain_se":0.1339626691910358,"cost_mean":6.7811889899999995,"cost_se":0.4300103230018283,"latency_mean":15.545831799999998,"latency_se":9.80385859659674,"warnings":""},{"run_name":"claude-code-sonnet-4.6","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.49602,"reward_se":0.2259065612150298,"baseline_reward":0.8046000000000001,"reference_reward":3.24404,"gain_mean":-0.3085800000000001,"gain_se":0.2259065612150298,"normalized_reward_mean":-0.22159196991384902,"normalized_reward_se":0.10042344711142268,"normalized_gain_mean":-0.12649624503984525,"normalized_gain_se":0.09260591005108952,"cost_mean":7.21415448,"cost_se":0.4958424114853072,"latency_mean":25.1392686,"latency_se":2.337431064308563,"warnings":""},{"run_name":"claude-code-sonnet-4.6","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":22.053333333333338,"reward_se":1.1815996689986747,"baseline_reward":8.2,"reference_reward":40.0,"gain_mean":13.853333333333339,"gain_se":1.1815996689986747,"normalized_reward_mean":0.479303675048356,"normalized_reward_se":0.034282388849091135,"normalized_gain_mean":0.4356394129979037,"normalized_gain_se":0.03715722229555581,"cost_mean":3.33835056,"cost_se":0.22764057626085432,"latency_mean":5.7853086000000005,"latency_se":0.3225546125147802,"warnings":""},{"run_name":"claude-code-sonnet-4.6","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":343.02,"reward_se":29.366722663586412,"baseline_reward":284.5,"reference_reward":1138.5,"gain_mean":58.52,"gain_se":29.366722663586412,"normalized_reward_mean":0.20839884565628425,"normalized_reward_se":0.029223527379427218,"normalized_gain_mean":0.06852459016393443,"normalized_gain_se":0.034387263072115246,"cost_mean":8.652264630000001,"cost_se":0.3031440379164707,"latency_mean":6.608841,"latency_se":0.35659030944684966,"warnings":""},{"run_name":"claude-code-sonnet-4.6","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":9.57296,"reward_se":0.25362297332852174,"baseline_reward":5.0358,"reference_reward":12.0,"gain_mean":4.53716,"gain_se":0.25362297332852174,"normalized_reward_mean":0.6209349180814344,"normalized_reward_se":0.03961187832141468,"normalized_gain_mean":0.6514976594583729,"normalized_gain_se":0.036418105931553056,"cost_mean":2.21443839,"cost_se":0.08254449655946693,"latency_mean":12.1489416,"latency_se":0.7661478202889832,"warnings":""},{"run_name":"codex-gpt-5.4","task":"blind_spectrum_monitoring","n_runs":1,"n_instances":90,"reward_mean":32.82769999999999,"reward_se":0.0,"baseline_reward":19.7601,"reference_reward":90.0,"gain_mean":13.067599999999992,"gain_se":0.0,"normalized_reward_mean":0.18603197653725123,"normalized_reward_se":0.0,"normalized_gain_mean":0.1860424060968195,"normalized_gain_se":0.0,"cost_mean":3.1478285,"cost_se":0.0,"latency_mean":11.084235,"latency_se":0.0,"warnings":""},{"run_name":"codex-gpt-5.4","task":"codebase_adaptation","n_runs":1,"n_instances":19,"reward_mean":7.45,"reward_se":0.0,"baseline_reward":8.25,"reference_reward":19.0,"gain_mean":-0.7999999999999998,"gain_se":0.0,"normalized_reward_mean":-0.2094240837696334,"normalized_reward_se":0.0,"normalized_gain_mean":-0.07441860465116278,"normalized_gain_se":0.0,"cost_mean":3.7940515,"cost_se":0.0,"latency_mean":6.205719,"latency_se":0.0,"warnings":""},{"run_name":"codex-gpt-5.4","task":"cohort_studies","n_runs":1,"n_instances":20,"reward_mean":0.8206,"reward_se":0.0,"baseline_reward":0.5048999999999999,"reference_reward":3.24404,"gain_mean":0.3157000000000001,"gain_se":0.0,"normalized_reward_mean":-0.07730469340398478,"normalized_reward_se":0.0,"normalized_gain_mean":0.11525515307724327,"normalized_gain_se":0.0,"cost_mean":7.763108,"cost_se":0.0,"latency_mean":13.096974,"latency_se":0.0,"warnings":""},{"run_name":"codex-gpt-5.4","task":"database_exploration","n_runs":1,"n_instances":40,"reward_mean":9.600000000000001,"reward_se":0.0,"baseline_reward":3.466666666666667,"reference_reward":40.0,"gain_mean":6.133333333333335,"gain_se":0.0,"normalized_reward_mean":0.11798839458413933,"normalized_reward_se":0.0,"normalized_gain_mean":0.16788321167883216,"normalized_gain_se":0.0,"cost_mean":1.8486595,"cost_se":0.0,"latency_mean":4.744029,"latency_se":0.0,"warnings":""},{"run_name":"codex-gpt-5.4","task":"exploitable_poker","n_runs":1,"n_instances":120,"reward_mean":85.0,"reward_se":0.0,"baseline_reward":64.5,"reference_reward":1138.5,"gain_mean":20.5,"gain_se":0.0,"normalized_reward_mean":-0.04836302119613892,"normalized_reward_se":0.0,"normalized_gain_mean":0.019087523277467412,"normalized_gain_se":0.0,"cost_mean":8.271512,"cost_se":0.0,"latency_mean":5.879567,"latency_se":0.0,"warnings":""},{"run_name":"codex-gpt-5.4","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":8.319939999999999,"reward_se":0.19792417133841939,"baseline_reward":5.176100000000001,"reference_reward":12.0,"gain_mean":3.1438399999999995,"gain_se":0.19792417133841939,"normalized_reward_mean":0.4252331047839194,"normalized_reward_se":0.030912610514067407,"normalized_gain_mean":0.46071015108662206,"normalized_gain_se":0.029004553310924743,"cost_mean":2.3832661,"cost_se":0.3322053896555337,"latency_mean":11.0281868,"latency_se":0.31744261101172283,"warnings":""},{"run_name":"icl-claude-opus-4.7","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":33.57226,"reward_se":3.0817330003749537,"baseline_reward":19.7597,"reference_reward":90.0,"gain_mean":13.812560000000001,"gain_se":3.0817330003749537,"normalized_reward_mean":0.19663235524423747,"normalized_reward_se":0.04387495551438594,"normalized_gain_mean":0.1966472238871417,"normalized_gain_se":0.04387414348137684,"cost_mean":7.576156299999999,"cost_se":0.42259227315663245,"latency_mean":13.06865,"latency_se":0.9791905247836601,"warnings":""},{"run_name":"icl-claude-opus-4.7","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":10.36,"reward_se":0.9236002923342967,"baseline_reward":8.875000000000002,"reference_reward":19.0,"gain_mean":1.4849999999999979,"gain_se":0.9236002923342967,"normalized_reward_mean":0.09528795811518327,"normalized_reward_se":0.09671207249573788,"normalized_gain_mean":0.14666666666666647,"normalized_gain_se":0.09121978195894291,"cost_mean":8.03356235,"cost_se":1.440226736655041,"latency_mean":4.738942799999999,"latency_se":0.18964228491330729,"warnings":""},{"run_name":"icl-claude-opus-4.7","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":-0.1207232,"reward_se":0.09252325762606935,"baseline_reward":-0.029840000000000016,"reference_reward":3.24404,"gain_mean":-0.0908832,"gain_se":0.09252325762606935,"normalized_reward_mean":-0.49575611013807264,"normalized_reward_se":0.0411298566044922,"normalized_gain_mean":-0.02776008894644886,"normalized_gain_se":0.02826104121900294,"cost_mean":6.967903699999999,"cost_se":0.21767060035276636,"latency_mean":8.0774976,"latency_se":0.26669492397467937,"warnings":""},{"run_name":"icl-claude-opus-4.7","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":15.653333333333336,"reward_se":1.7811856226183238,"baseline_reward":6.066666666666666,"reference_reward":40.0,"gain_mean":9.58666666666667,"gain_se":1.7811856226183238,"normalized_reward_mean":0.2936170212765959,"normalized_reward_se":0.05167849968911964,"normalized_gain_mean":0.28251473477406686,"normalized_gain_se":0.05249073544061857,"cost_mean":5.219724,"cost_se":0.22425686644681306,"latency_mean":4.4711024,"latency_se":0.12573060186151974,"warnings":""},{"run_name":"icl-claude-opus-4.7","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":116.67999999999999,"reward_se":26.48460685001762,"baseline_reward":157.70000000000002,"reference_reward":1138.5,"gain_mean":-41.02000000000002,"gain_se":26.48460685001762,"normalized_reward_mean":-0.016837496268285394,"normalized_reward_se":0.026355465071168895,"normalized_gain_mean":-0.041823001631321396,"normalized_gain_se":0.027003065711681914,"cost_mean":17.46652605,"cost_se":0.6593005170088617,"latency_mean":4.945791,"latency_se":0.08458654511150107,"warnings":""},{"run_name":"icl-claude-opus-4.7","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":9.0388,"reward_se":0.2915972616469503,"baseline_reward":4.3896,"reference_reward":12.0,"gain_mean":4.6492,"gain_se":0.2915972616469503,"normalized_reward_mean":0.5375076139753541,"normalized_reward_se":0.04554285873880555,"normalized_gain_mean":0.6109008724902765,"normalized_gain_se":0.038315628829884145,"cost_mean":4.3582252,"cost_se":0.13837229127733633,"latency_mean":9.5466268,"latency_se":0.1901113062740879,"warnings":""},{"run_name":"icl-claude-sonnet-4.6","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":36.58424,"reward_se":1.2621533403671679,"baseline_reward":19.7597,"reference_reward":90.0,"gain_mean":16.82454,"gain_se":1.2621533403671679,"normalized_reward_mean":0.23951422998618993,"normalized_reward_se":0.01796940930775165,"normalized_gain_mean":0.23952830497591834,"normalized_gain_se":0.017969076731835825,"cost_mean":3.59543283,"cost_se":0.16672859214934213,"latency_mean":9.8436802,"latency_se":0.6945913129143929,"warnings":""},{"run_name":"icl-claude-sonnet-4.6","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":9.754999999999999,"reward_se":0.222963001415033,"baseline_reward":7.050000000000001,"reference_reward":19.0,"gain_mean":2.7049999999999983,"gain_se":0.222963001415033,"normalized_reward_mean":0.03193717277486908,"normalized_reward_se":0.02334691114293539,"normalized_gain_mean":0.22635983263598317,"normalized_gain_se":0.01865799175021197,"cost_mean":6.92012301,"cost_se":0.7416783388544078,"latency_mean":5.2033894,"latency_se":0.13644269594544076,"warnings":""},{"run_name":"icl-claude-sonnet-4.6","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.76154,"reward_se":0.2397795875382223,"baseline_reward":0.5761000000000001,"reference_reward":3.24404,"gain_mean":0.18543999999999997,"gain_se":0.2397795875382223,"normalized_reward_mean":-0.10355894982974294,"normalized_reward_se":0.10659049740756878,"normalized_gain_mean":0.06950681049798721,"normalized_gain_se":0.08987443028637163,"cost_mean":5.61744636,"cost_se":0.39805137991260536,"latency_mean":15.9242066,"latency_se":1.8087371921105233,"warnings":""},{"run_name":"icl-claude-sonnet-4.6","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":15.013333333333335,"reward_se":1.4271027370943634,"baseline_reward":6.533333333333332,"reference_reward":40.0,"gain_mean":8.480000000000004,"gain_se":1.4271027370943634,"normalized_reward_mean":0.27504835589941984,"normalized_reward_se":0.04140530184993317,"normalized_gain_mean":0.25338645418326705,"normalized_gain_se":0.04264251206457261,"cost_mean":1.94947071,"cost_se":0.15903059876278475,"latency_mean":2.0253205999999997,"latency_se":0.05402848531432287,"warnings":""},{"run_name":"icl-claude-sonnet-4.6","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":339.91999999999996,"reward_se":19.529603170571605,"baseline_reward":316.7,"reference_reward":1138.5,"gain_mean":23.21999999999999,"gain_se":19.529603170571605,"normalized_reward_mean":0.2053139615882177,"normalized_reward_se":0.019434374734373174,"normalized_gain_mean":0.028255049890484295,"normalized_gain_se":0.023764423424886355,"cost_mean":9.38512821,"cost_se":0.40901446388615886,"latency_mean":8.432848400000001,"latency_se":0.2466075688257763,"warnings":""},{"run_name":"icl-claude-sonnet-4.6","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":10.018,"reward_se":0.21541208183386595,"baseline_reward":5.1783,"reference_reward":12.0,"gain_mean":4.839700000000001,"gain_se":0.21541208183386595,"normalized_reward_mean":0.6904430943195841,"normalized_reward_se":0.03364394424756212,"normalized_gain_mean":0.7094565870677397,"normalized_gain_se":0.03157747802364015,"cost_mean":2.95949592,"cost_se":0.12814693688487694,"latency_mean":15.6693362,"latency_se":1.0930564757940187,"warnings":""},{"run_name":"icl-gemini-3-flash","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":33.038859999999985,"reward_se":0.8792992571360508,"baseline_reward":19.7597,"reference_reward":90.0,"gain_mean":13.279159999999987,"gain_se":0.8792992571360508,"normalized_reward_mean":0.189038283574652,"normalized_reward_se":0.01251867562374252,"normalized_gain_mean":0.18905329276782684,"normalized_gain_se":0.012518443929425853,"cost_mean":0.68096565,"cost_se":0.02356635676358354,"latency_mean":3.9808568,"latency_se":0.17519605824201637,"warnings":""},{"run_name":"icl-gemini-3-flash","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":7.419999999999999,"reward_se":0.9337223891500082,"baseline_reward":7.749999999999999,"reference_reward":19.0,"gain_mean":-0.3299999999999995,"gain_se":0.9337223891500082,"normalized_reward_mean":-0.21256544502617797,"normalized_reward_se":0.09777197792146683,"normalized_gain_mean":-0.029333333333333288,"normalized_gain_se":0.08299754570222294,"cost_mean":2.75498705,"cost_se":0.3530711465907466,"latency_mean":6.53639,"latency_se":0.9478799716914056,"warnings":""},{"run_name":"icl-gemini-3-flash","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.5761200000000001,"reward_se":0.2147917046815356,"baseline_reward":0.1396,"reference_reward":3.24404,"gain_mean":0.4365200000000001,"gain_se":0.21479170468153563,"normalized_reward_mean":-0.18598469020333042,"normalized_reward_se":0.09548250072527521,"normalized_gain_mean":0.14061151125484792,"normalized_gain_se":0.06918855081159102,"cost_mean":1.2471353,"cost_se":0.14991701713337696,"latency_mean":4.5149222,"latency_se":0.7101842874071068,"warnings":""},{"run_name":"icl-gemini-3-flash","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":15.026666666666667,"reward_se":0.7119925093238924,"baseline_reward":3.533333333333333,"reference_reward":40.0,"gain_mean":11.493333333333336,"gain_se":0.7119925093238924,"normalized_reward_mean":0.27543520309477765,"normalized_reward_se":0.020657422901080056,"normalized_gain_mean":0.31517367458866546,"normalized_gain_se":0.019524474661532697,"cost_mean":0.41665848,"cost_se":0.015385875475250022,"latency_mean":1.4732294,"latency_se":0.040382675804854766,"warnings":""},{"run_name":"icl-gemini-3-flash","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":94.84,"reward_se":16.912143566088837,"baseline_reward":196.8,"reference_reward":1138.5,"gain_mean":-101.96000000000001,"gain_se":16.912143566088837,"normalized_reward_mean":-0.03857100208976017,"normalized_reward_se":0.016829678143187216,"normalized_gain_mean":-0.10827227354783901,"normalized_gain_se":0.01795916275468709,"cost_mean":1.9132159899999999,"cost_se":0.14958022277078145,"latency_mean":2.4436378,"latency_se":0.08005127874831228,"warnings":""},{"run_name":"icl-gemini-3-flash","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":8.48084,"reward_se":0.21057797985544463,"baseline_reward":5.2707,"reference_reward":12.0,"gain_mean":3.2101400000000013,"gain_se":0.21057797985544463,"normalized_reward_mean":0.4503631280553518,"normalized_reward_se":0.032888934333241394,"normalized_gain_mean":0.47703921656041504,"normalized_gain_se":0.031292702042626214,"cost_mean":0.58893324,"cost_se":0.05042373644450201,"latency_mean":3.5718224,"latency_se":0.08312163899827767,"warnings":""},{"run_name":"icl-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":33.03251999999999,"reward_se":1.1359706622091956,"baseline_reward":19.7597,"reference_reward":90.0,"gain_mean":13.27281999999999,"gain_se":1.1359706622091956,"normalized_reward_mean":0.1889480203305854,"normalized_reward_se":0.01617293330214262,"normalized_gain_mean":0.1889630311943427,"normalized_gain_se":0.016172633975213596,"cost_mean":3.83594968,"cost_se":0.17068404696924194,"latency_mean":15.7138168,"latency_se":1.4278658441258898,"warnings":""},{"run_name":"icl-gemini-3.1-pro-preview","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":5.125,"reward_se":0.6515846069391142,"baseline_reward":6.675,"reference_reward":19.0,"gain_mean":-1.5499999999999996,"gain_se":0.6515846069391142,"normalized_reward_mean":-0.45287958115183236,"normalized_reward_se":0.0682287546533104,"normalized_gain_mean":-0.1257606490872211,"normalized_gain_se":0.052866905228325704,"cost_mean":3.4006354,"cost_se":0.9562494786153496,"latency_mean":4.6171954,"latency_se":0.23019030393667747,"warnings":""},{"run_name":"icl-gemini-3.1-pro-preview","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.2572,"reward_se":0.14307361391954843,"baseline_reward":0.8200999999999998,"reference_reward":3.24404,"gain_mean":-0.5628999999999997,"gain_se":0.14307361391954843,"normalized_reward_mean":-0.32775589676111555,"normalized_reward_se":0.06360127578062556,"normalized_gain_mean":-0.23222522009620694,"normalized_gain_se":0.05902522913914883,"cost_mean":1.83194716,"cost_se":0.10305024369804082,"latency_mean":9.9559316,"latency_se":1.6777096085564864,"warnings":""},{"run_name":"icl-gemini-3.1-pro-preview","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":11.56,"reward_se":2.893563738905903,"baseline_reward":4.7333333333333325,"reference_reward":40.0,"gain_mean":6.826666666666668,"gain_se":2.893563738905903,"normalized_reward_mean":0.17485493230174087,"normalized_reward_se":0.08395252627386567,"normalized_gain_mean":0.19357277882797735,"normalized_gain_se":0.08204812114099914,"cost_mean":1.32027676,"cost_se":0.14872640669066672,"latency_mean":6.0846158,"latency_se":0.28502198711632754,"warnings":""},{"run_name":"icl-gemini-3.1-pro-preview","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":76.4,"reward_se":3.867815921162743,"baseline_reward":43.5,"reference_reward":1138.5,"gain_mean":32.9,"gain_se":3.867815921162743,"normalized_reward_mean":-0.056921086675291076,"normalized_reward_se":0.0038489560365834847,"normalized_gain_mean":0.030045662100456623,"normalized_gain_se":0.0035322519827970253,"cost_mean":3.9894627999999996,"cost_se":0.08792257000352074,"latency_mean":6.4949088,"latency_se":0.3019363284163402,"warnings":""},{"run_name":"icl-gemini-3.1-pro-preview","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":6.467999999999999,"reward_se":0.05444100476662789,"baseline_reward":3.9083999999999994,"reference_reward":12.0,"gain_mean":2.5595999999999997,"gain_se":0.05444100476662789,"normalized_reward_mean":0.13598950442781943,"normalized_reward_se":0.008502819867653942,"normalized_gain_mean":0.31632804389737507,"normalized_gain_se":0.006728088977041367,"cost_mean":0.8498422,"cost_se":0.006386128874051945,"latency_mean":9.5360972,"latency_se":0.26997037789976147,"warnings":""},{"run_name":"icl-gpt-5.4","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":46.197979999999994,"reward_se":1.0011550436371022,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain_mean":26.436979999999995,"gain_se":1.0011550436371022,"normalized_reward_mean":0.3763860533321942,"normalized_reward_se":0.014253549219623025,"normalized_gain_mean":0.3763860533321942,"normalized_gain_se":0.014253549219623025,"cost_mean":1.9346845000000001,"cost_se":0.0450638136843188,"latency_mean":6.1384548,"latency_se":0.09588265706497714,"warnings":""},{"run_name":"icl-gpt-5.4","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":10.14,"reward_se":1.9586123914649376,"baseline_reward":9.45,"reference_reward":19.0,"gain_mean":0.6900000000000013,"gain_se":1.9586123914649376,"normalized_reward_mean":0.07225130890052367,"normalized_reward_se":0.20509030277119764,"normalized_gain_mean":0.07225130890052367,"normalized_gain_se":0.20509030277119764,"cost_mean":3.5589615,"cost_se":0.695756590390939,"latency_mean":5.982151,"latency_se":0.27182068753996635,"warnings":""},{"run_name":"icl-gpt-5.4","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.9571,"reward_se":0.39280120290039844,"baseline_reward":0.9944999999999999,"reference_reward":3.24404,"gain_mean":-0.03739999999999995,"gain_se":0.39280120290039844,"normalized_reward_mean":-0.016625621238119774,"normalized_reward_se":0.1746140112647023,"normalized_gain_mean":-0.016625621238119774,"normalized_gain_se":0.1746140112647023,"cost_mean":3.7341352999999997,"cost_se":0.3393777282411002,"latency_mean":11.216186400000002,"latency_se":1.3149000882897757,"warnings":""},{"run_name":"icl-gpt-5.4","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":13.880000000000004,"reward_se":1.7153425313913258,"baseline_reward":5.533333333333332,"reference_reward":40.0,"gain_mean":8.346666666666671,"gain_se":1.715342531391326,"normalized_reward_mean":0.242166344294004,"normalized_reward_se":0.049768158551005595,"normalized_gain_mean":0.242166344294004,"normalized_gain_se":0.049768158551005595,"cost_mean":1.0318076999999999,"cost_se":0.048060063507708764,"latency_mean":3.7732802,"latency_se":0.09005126277093506,"warnings":""},{"run_name":"icl-gpt-5.4","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":95.76,"reward_se":26.414439233116422,"baseline_reward":133.6,"reference_reward":1138.5,"gain_mean":-37.839999999999996,"gain_se":26.414439233116422,"normalized_reward_mean":-0.03765548810826948,"normalized_reward_se":0.02628563959908093,"normalized_gain_mean":-0.03765548810826948,"normalized_gain_se":0.02628563959908093,"cost_mean":4.6101349,"cost_se":0.0895960940607067,"latency_mean":6.4486556,"latency_se":0.08940567952462525,"warnings":""},{"run_name":"icl-gpt-5.4","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":9.23002,"reward_se":0.2535385520980981,"baseline_reward":5.5973,"reference_reward":12.0,"gain_mean":3.6327200000000004,"gain_se":0.2535385520980981,"normalized_reward_mean":0.5673731394567917,"normalized_reward_se":0.03959869306669032,"normalized_gain_mean":0.5673731394567917,"normalized_gain_se":0.03959869306669032,"cost_mean":3.5198270000000003,"cost_se":0.06747302407110559,"latency_mean":11.3441266,"latency_se":0.6226097109842249,"warnings":""},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":35.99298,"reward_se":2.4137046839661256,"baseline_reward":19.7597,"reference_reward":90.0,"gain_mean":16.23328,"gain_se":2.4137046839661256,"normalized_reward_mean":0.23109639943621058,"normalized_reward_se":0.034364166402797956,"normalized_gain_mean":0.23111063022225134,"normalized_gain_se":0.03436353039446195,"cost_mean":2.985492,"cost_se":0.27217110069458145,"latency_mean":18.3305742,"latency_se":2.0345622181599756,"warnings":""},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":8.765,"reward_se":0.45487635682677574,"baseline_reward":7.874999999999999,"reference_reward":19.0,"gain_mean":0.8900000000000015,"gain_se":0.45487635682677574,"normalized_reward_mean":-0.07172774869109934,"normalized_reward_se":0.047631032128458185,"normalized_gain_mean":0.08000000000000013,"normalized_gain_se":0.04088776241139557,"cost_mean":4.24174809,"cost_se":0.5284832598783283,"latency_mean":5.8407932,"latency_se":1.0958890414535314,"warnings":""},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":-0.783885,"reward_se":0.35783186727945293,"baseline_reward":-1.5791640000000002,"reference_reward":3.24404,"gain_mean":0.7952790000000002,"gain_se":0.35783186727945293,"normalized_reward_mean":-0.7905549579025044,"normalized_reward_se":0.15906890621169348,"normalized_gain_mean":0.16488603840932295,"normalized_gain_se":0.07418966049942173,"cost_mean":11.55971091,"cost_se":0.22609314851865803,"latency_mean":24.3249584,"latency_se":0.5298055375775715,"warnings":""},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":11.0,"reward_se":0.8773698067393116,"baseline_reward":7.333333333333332,"reference_reward":40.0,"gain_mean":3.666666666666667,"gain_se":0.8773698067393116,"normalized_reward_mean":0.1586073500967118,"normalized_reward_se":0.02545560367715604,"normalized_gain_mean":0.11224489795918366,"normalized_gain_se":0.02685825938997892,"cost_mean":2.24002683,"cost_se":0.04685138941926268,"latency_mean":2.3692181999999997,"latency_se":0.03077355713010767,"warnings":""},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":115.42,"reward_se":29.931445003541008,"baseline_reward":317.0,"reference_reward":1138.5,"gain_mean":-201.57999999999998,"gain_se":29.931445003541008,"normalized_reward_mean":-0.018091352373370475,"normalized_reward_se":0.029785496072784363,"normalized_gain_mean":-0.2453804017041996,"normalized_gain_se":0.03643511260321486,"cost_mean":6.74607549,"cost_se":0.30715313468392663,"latency_mean":17.0544942,"latency_se":0.878624421870141,"warnings":""},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":10.07268,"reward_se":0.09509674757845332,"baseline_reward":4.2825,"reference_reward":12.0,"gain_mean":5.79018,"gain_se":0.09509674757845332,"normalized_reward_mean":0.6989832414450154,"normalized_reward_se":0.014852600868142088,"normalized_gain_mean":0.7502662779397473,"normalized_gain_se":0.012322221908448767,"cost_mean":3.75295689,"cost_se":0.09699363949936017,"latency_mean":18.6206684,"latency_se":0.4700113214532816,"warnings":""},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":29.12167999999999,"reward_se":3.0110064211821217,"baseline_reward":19.7597,"reference_reward":90.0,"gain_mean":9.361979999999996,"gain_se":3.0110064211821217,"normalized_reward_mean":0.13326898161989764,"normalized_reward_se":0.04286801379834738,"normalized_gain_mean":0.13328502298538009,"normalized_gain_se":0.04286722040170844,"cost_mean":2.7960348,"cost_se":0.5266636131397725,"latency_mean":18.1229512,"latency_se":3.014098572889238,"warnings":""},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":7.109999999999999,"reward_se":0.6338670996352471,"baseline_reward":8.425,"reference_reward":19.0,"gain_mean":-1.3150000000000008,"gain_se":0.6338670996352471,"normalized_reward_mean":-0.24502617801047114,"normalized_reward_se":0.0663735182864133,"normalized_gain_mean":-0.1243498817966904,"normalized_gain_se":0.05994015126574441,"cost_mean":3.50625184,"cost_se":0.05082792736436932,"latency_mean":4.287941399999999,"latency_se":0.03822841799708699,"warnings":""},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.32662,"reward_se":0.08612120180304034,"baseline_reward":0.6924999999999999,"reference_reward":3.24404,"gain_mean":-0.36587999999999987,"gain_se":0.08612120180304034,"normalized_reward_mean":-0.2968962543453328,"normalized_reward_se":0.03828391662430556,"normalized_gain_mean":-0.14339575315299774,"normalized_gain_se":0.03375263636981601,"cost_mean":1.5313784,"cost_se":0.06077166332889036,"latency_mean":6.453574000000001,"latency_se":0.2535837527317158,"warnings":""},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":8.52,"reward_se":0.6509480265991547,"baseline_reward":4.2,"reference_reward":40.0,"gain_mean":4.32,"gain_se":0.6509480265991547,"normalized_reward_mean":0.08665377176015478,"normalized_reward_se":0.01888630638101996,"normalized_gain_mean":0.12067039106145253,"normalized_gain_se":0.01818290577092611,"cost_mean":2.632269,"cost_se":0.1582454473175642,"latency_mean":4.502319,"latency_se":0.06594865731536917,"warnings":""},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":53.5,"reward_se":6.683935966180406,"baseline_reward":36.5,"reference_reward":1138.5,"gain_mean":17.0,"gain_se":6.683935966180406,"normalized_reward_mean":-0.07970942382326598,"normalized_reward_se":0.006651344378724655,"normalized_gain_mean":0.015426497277676952,"normalized_gain_se":0.006065277646261711,"cost_mean":1.3261488,"cost_se":0.022974193710335097,"latency_mean":5.3510864,"latency_se":0.05250344337526828,"warnings":""},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":8.10796,"reward_se":0.3497204449270876,"baseline_reward":3.1562,"reference_reward":12.0,"gain_mean":4.95176,"gain_se":0.3497204449270876,"normalized_reward_mean":0.3921251971824387,"normalized_reward_se":0.054620776379822204,"normalized_gain_mean":0.5599131595015717,"normalized_gain_se":0.03954413769274379,"cost_mean":1.5254393199999998,"cost_se":0.1105218753624928,"latency_mean":9.1115922,"latency_se":0.5857749702287901,"warnings":""},{"run_name":"icl-notepad-gpt-5.4","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":31.91484000000002,"reward_se":2.1217473767157133,"baseline_reward":19.761400000000002,"reference_reward":90.0,"gain_mean":12.153440000000018,"gain_se":2.1217473767157133,"normalized_reward_mean":0.17303549310212296,"normalized_reward_se":0.030207539639170736,"normalized_gain_mean":0.1730307836431822,"normalized_gain_se":0.030207711667312755,"cost_mean":1.0235845,"cost_se":0.05345542888940281,"latency_mean":7.356661,"latency_se":0.2634685723981136,"warnings":""},{"run_name":"icl-notepad-gpt-5.4","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":8.860000000000001,"reward_se":0.6853192686624243,"baseline_reward":8.249999999999998,"reference_reward":19.0,"gain_mean":0.6100000000000028,"gain_se":0.6853192686624243,"normalized_reward_mean":-0.06178010471204169,"normalized_reward_se":0.07176117996465176,"normalized_gain_mean":0.05674418604651188,"normalized_gain_se":0.0637506296430162,"cost_mean":3.4942101,"cost_se":0.19460902604314112,"latency_mean":9.574368400000001,"latency_se":0.31655509153836076,"warnings":""},{"run_name":"icl-notepad-gpt-5.4","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.47565999999999997,"reward_se":0.045543864570323854,"baseline_reward":1.3756000000000002,"reference_reward":3.24404,"gain_mean":-0.8999400000000002,"gain_se":0.04554386457032384,"normalized_reward_mean":-0.23064270917609822,"normalized_reward_se":0.020245856739744057,"normalized_gain_mean":-0.4816531437991053,"normalized_gain_se":0.024375342301772518,"cost_mean":4.4385547,"cost_se":0.05137319122791568,"latency_mean":10.543479600000001,"latency_se":0.3564430818500479,"warnings":""},{"run_name":"icl-notepad-gpt-5.4","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":12.373333333333333,"reward_se":1.1580827066991177,"baseline_reward":5.999999999999999,"reference_reward":40.0,"gain_mean":6.373333333333335,"gain_se":1.158082706699118,"normalized_reward_mean":0.19845261121856866,"normalized_reward_se":0.03360007853092217,"normalized_gain_mean":0.1874509803921569,"normalized_gain_se":0.03406125607938582,"cost_mean":1.4501504,"cost_se":0.06801183465368363,"latency_mean":3.5376958000000003,"latency_se":0.09406373289020589,"warnings":""},{"run_name":"icl-notepad-gpt-5.4","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":81.34,"reward_se":15.206104037523879,"baseline_reward":81.1,"reference_reward":1138.5,"gain_mean":0.24000000000000626,"gain_se":15.206104037523879,"normalized_reward_mean":-0.0520051746442432,"normalized_reward_se":0.015131957446038291,"normalized_gain_mean":0.00022697181766598,"normalized_gain_se":0.014380654470894531,"cost_mean":1.5183047,"cost_se":0.04176319037789137,"latency_mean":4.3637582,"latency_se":0.06867228717568674,"warnings":""},{"run_name":"icl-notepad-gpt-5.4","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":8.4855,"reward_se":0.3170361982487176,"baseline_reward":4.482699999999999,"reference_reward":12.0,"gain_mean":4.002800000000002,"gain_se":0.3170361982487176,"normalized_reward_mean":0.4510909460071534,"normalized_reward_se":0.04951601640694044,"normalized_gain_mean":0.5324784164527159,"normalized_gain_se":0.04217421125253981,"cost_mean":2.3523074,"cost_se":0.10346429705451538,"latency_mean":13.430652,"latency_se":1.0201454536392838,"warnings":""},{"run_name":"mem0-gpt-5.4","task":"blind_spectrum_monitoring","n_runs":5,"n_instances":90,"reward_mean":33.79351999999999,"reward_se":2.9861827511724663,"baseline_reward":19.7601,"reference_reward":90.0,"gain_mean":14.033419999999984,"gain_se":2.9861827511724663,"normalized_reward_mean":0.19978245703953618,"normalized_reward_se":0.04251459660832965,"normalized_gain_mean":0.1997927104110339,"normalized_gain_se":0.04251405185902124,"cost_mean":1.3890744,"cost_se":0.07433766140799697,"latency_mean":5.742049,"latency_se":0.25614974117008193,"warnings":""},{"run_name":"mem0-gpt-5.4","task":"codebase_adaptation","n_runs":5,"n_instances":19,"reward_mean":11.105,"reward_se":1.1002954148772954,"baseline_reward":8.125,"reference_reward":19.0,"gain_mean":2.979999999999999,"gain_se":1.1002954148772954,"normalized_reward_mean":0.1732984293193717,"normalized_reward_se":0.11521417956830317,"normalized_gain_mean":0.2740229885057471,"normalized_gain_se":0.10117658987377429,"cost_mean":2.6241722,"cost_se":0.1610438971701194,"latency_mean":5.1777718,"latency_se":0.2356629131930606,"warnings":""},{"run_name":"mem0-gpt-5.4","task":"cohort_studies","n_runs":5,"n_instances":20,"reward_mean":0.7782,"reward_se":0.3151897983755185,"baseline_reward":0.8448,"reference_reward":3.24404,"gain_mean":-0.06659999999999999,"gain_se":0.3151897983755185,"normalized_reward_mean":-0.09615299127821686,"normalized_reward_se":0.14011300015804054,"normalized_gain_mean":-0.027758790283589806,"normalized_gain_se":0.13137068337286747,"cost_mean":5.9987073,"cost_se":0.5500077826915407,"latency_mean":9.0789544,"latency_se":0.7549990887757815,"warnings":""},{"run_name":"mem0-gpt-5.4","task":"database_exploration","n_runs":5,"n_instances":40,"reward_mean":17.240000000000002,"reward_se":1.3003589248101215,"baseline_reward":4.333333333333333,"reference_reward":40.0,"gain_mean":12.906666666666672,"gain_se":1.3003589248101217,"normalized_reward_mean":0.33965183752417805,"normalized_reward_se":0.03772801522659927,"normalized_gain_mean":0.3618691588785048,"normalized_gain_se":0.03645866144327445,"cost_mean":1.9689827000000002,"cost_se":0.10415860059551009,"latency_mean":3.0177456,"latency_se":0.03601414614925641,"warnings":""},{"run_name":"mem0-gpt-5.4","task":"exploitable_poker","n_runs":5,"n_instances":120,"reward_mean":73.41999999999999,"reward_se":41.547784537806585,"baseline_reward":90.4,"reference_reward":1138.5,"gain_mean":-16.98000000000001,"gain_se":41.547784537806585,"normalized_reward_mean":-0.059886555876206594,"normalized_reward_se":0.04134519309165746,"normalized_gain_mean":-0.01620074420379736,"normalized_gain_se":0.03964105003130101,"cost_mean":3.5733655,"cost_se":0.11362767348031907,"latency_mean":4.6700327999999995,"latency_se":0.1186347473666127,"warnings":""},{"run_name":"mem0-gpt-5.4","task":"sales_prediction","n_runs":5,"n_instances":12,"reward_mean":7.8148,"reward_se":0.19769953970609008,"baseline_reward":4.750499999999999,"reference_reward":12.0,"gain_mean":3.0643000000000016,"gain_se":0.19769953970609008,"normalized_reward_mean":0.34633826354506697,"normalized_reward_se":0.03087752662253269,"normalized_gain_mean":0.42269122008414384,"normalized_gain_se":0.027270782772065665,"cost_mean":2.7885073,"cost_se":0.2917587954943518,"latency_mean":10.0952168,"latency_se":0.42886253085684684,"warnings":""}],"run_points":[{"run_name":"ace-gpt-5.4","task":"blind_spectrum_monitoring","run_index":0,"reward":19.8113,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":0.05029999999999646,"normalized_reward":0.0007161263685416429,"normalized_gain":0.0007161263685416429,"cost_usd":3.75426,"latency_seconds":4.086263,"instance_count":90,"reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2092,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2906,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0020000000000000018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04920000000000002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0008999999999999841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"cost_curve":[0.0213875,0.0241775,0.0256925,0.0286675,0.023215,0.0217075,0.025675,0.0219375,0.0246,0.023925,0.02452,0.02337,0.0226625,0.0253375,0.0253475,0.02693,0.024005,0.0276825,0.026965,0.031415,0.025745,0.0285675,0.026665,0.0283275,0.0324675,0.030515,0.03082,0.0309575,0.034075,0.0369475,0.0357175,0.0349375,0.0342575,0.0347025,0.03569,0.0375,0.0366575,0.0433325,0.0376675,0.04058,0.0397425,0.036805,0.040735,0.0398975,0.0464925,0.0421925,0.04058,0.04228,0.0390975,0.039805,0.0440325,0.04088,0.0429575,0.04185,0.049575,0.052585,0.04828,0.0483975,0.04897,0.0495575,0.0524625,0.04999,0.0541075,0.05416,0.0504075,0.05103,0.05125,0.0497,0.05414,0.05445,0.054565,0.0559575,0.0550825,0.0562675,0.0559475,0.054485,0.05197,0.054,0.058965,0.0584,0.056295,0.0561575,0.0602775,0.0594775,0.0646675,0.0680625,0.0708725,0.0624525,0.0625825,0.066015]},{"run_name":"ace-gpt-5.4","task":"blind_spectrum_monitoring","run_index":1,"reward":19.776399999999995,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":0.01539999999999253,"normalized_reward":0.00021925141303253933,"normalized_gain":0.00021925141303253933,"cost_usd":4.097735,"latency_seconds":3.987843,"instance_count":90,"reward_curve":[0.2072,0.2083,0.2333,0.2105,0.1942,0.195,0.2193,0.227,0.2094,0.2404,0.2273,0.2221,0.2018,0.2128,0.2363,0.2483,0.2239,0.2203,0.1974,0.2474,0.2094,0.2264,0.2019,0.2285,0.192,0.2241,0.2244,0.213,0.2126,0.2482,0.2264,0.2414,0.2027,0.2075,0.1926,0.2021,0.2177,0.2361,0.2273,0.2252,0.2309,0.2216,0.2215,0.1982,0.2203,0.2085,0.2095,0.2127,0.2156,0.2333,0.1973,0.1955,0.2287,0.2397,0.2246,0.2235,0.2307,0.1998,0.2215,0.2139,0.1997,0.2552,0.2461,0.2017,0.2272,0.221,0.263,0.2303,0.2377,0.2193,0.2212,0.2129,0.2488,0.2135,0.2442,0.2088,0.2114,0.2293,0.2166,0.1981,0.2155,0.2208,0.2166,0.2176,0.2353,0.2031,0.2541,0.2101,0.2079,0.2004],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.03989999999999999,0.021600000000000008,-0.015899999999999997,-0.029899999999999982,-0.017799999999999983,-0.008000000000000007,0.032,-0.012699999999999989,0.02779999999999999,-0.0131,-0.006400000000000017,-0.017499999999999988,-0.035500000000000004,0.044300000000000006,0.0509,0.0,-0.0067000000000000115,-0.009099999999999997,0.0,0.007599999999999996,0.024499999999999994,-0.011099999999999999,0.020199999999999996,-0.032399999999999984,-0.009200000000000014,0.014999999999999986,0.0025000000000000022,-0.018599999999999978,0.04100000000000001,0.028200000000000003,0.03290000000000001,-0.0068000000000000005,0.004799999999999999,-0.03090000000000001,-0.011800000000000005,0.014800000000000008,-0.005299999999999999,0.03,0.0049000000000000155,0.004500000000000004,0.028999999999999998,-0.018199999999999994,-0.023400000000000004,-0.007000000000000006,-0.0189,-0.01200000000000001,-0.018199999999999994,-0.018599999999999978,0.004600000000000021,-0.0204,-0.025999999999999995,0.021199999999999997,0.026999999999999996,0.0,-0.001700000000000007,0.030899999999999983,-0.0363,0.025999999999999995,-0.001700000000000007,-0.042200000000000015,0.04379999999999998,0.029500000000000026,-0.01930000000000001,0.029100000000000015,0.005500000000000005,0.0358,-0.024899999999999978,0.02889999999999998,-0.0019000000000000128,-0.032899999999999985,-0.0010000000000000009,0.0015999999999999903,-0.01680000000000001,0.023400000000000004,-0.02889999999999998,-0.030799999999999994,0.016399999999999998,-0.032200000000000006,-0.0015999999999999903,0.007599999999999996,0.0032000000000000084,0.0,0.007499999999999979,0.016000000000000014,0.002700000000000008,0.05449999999999999,0.008400000000000019,-0.0363,-0.021800000000000014],"cost_curve":[0.028115,0.02778,0.026105,0.029165,0.0298825,0.028555,0.029965,0.03381,0.0337375,0.03191,0.0347725,0.0348975,0.02991,0.0277025,0.0376025,0.0340825,0.0332875,0.033355,0.037035,0.0343425,0.0356425,0.04049,0.0359225,0.03876,0.0377025,0.03591,0.03837,0.035115,0.03998,0.03863,0.0401175,0.0410725,0.03952,0.03794,0.036985,0.04132,0.0423625,0.042565,0.0468225,0.0494375,0.0456025,0.0428325,0.0451125,0.047125,0.049725,0.0444925,0.047925,0.04866,0.0443275,0.0507775,0.0481625,0.046035,0.04594,0.0505,0.0543,0.0477075,0.051585,0.0498225,0.0505175,0.048005,0.04891,0.050085,0.0579725,0.051675,0.050005,0.051665,0.057605,0.05726,0.0555575,0.0622425,0.055935,0.0522675,0.056215,0.06591,0.053775,0.0571975,0.059655,0.062745,0.0544025,0.0547675,0.0581175,0.0550925,0.058865,0.0578175,0.061275,0.0651775,0.0565975,0.0603675,0.0592025,0.0595425]},{"run_name":"ace-gpt-5.4","task":"blind_spectrum_monitoring","run_index":2,"reward":19.7827,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":0.02169999999999561,"normalized_reward":0.0003089451729095746,"normalized_gain":0.0003089451729095746,"cost_usd":4.7296,"latency_seconds":4.09146,"instance_count":90,"reward_curve":[0.2482,0.2474,0.2312,0.192,0.2072,0.2105,0.2117,0.1974,0.2019,0.2273,0.2404,0.2241,0.2285,0.2221,0.2083,0.2244,0.2065,0.2018,0.2333,0.2094,0.2203,0.2126,0.195,0.2193,0.2128,0.2264,0.213,0.227,0.2239,0.2483,0.2127,0.2287,0.1998,0.2273,0.2215,0.2029,0.2139,0.2246,0.1982,0.2309,0.2235,0.2361,0.2216,0.1955,0.2414,0.2274,0.2075,0.2397,0.1926,0.2085,0.2203,0.2177,0.2252,0.2264,0.1973,0.2156,0.2333,0.2095,0.2215,0.2027,0.2272,0.2442,0.263,0.2303,0.2101,0.2176,0.2166,0.2166,0.2541,0.2422,0.2193,0.2212,0.2069,0.221,0.2419,0.2155,0.2088,0.2114,0.2222,0.2004,0.2017,0.2552,0.1981,0.2129,0.2488,0.2208,0.2079,0.2377,0.1996,0.2135],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,-0.0007999999999999952,0.01949999999999999,-0.034399999999999986,-0.0169,-0.0022999999999999965,-0.015600000000000003,0.0023999999999999855,-0.020199999999999996,0.01469999999999999,0.0,-0.004400000000000015,0.009200000000000014,-0.0262,0.01630000000000001,0.026999999999999996,-0.0174,-0.0252,0.026800000000000018,-0.038000000000000006,0.01849999999999999,0.010700000000000015,-0.017999999999999988,0.010999999999999982,-0.0116,-0.006900000000000017,0.003599999999999992,0.016500000000000015,-0.007300000000000001,0.0411,0.014500000000000013,0.020199999999999996,-0.009699999999999986,0.02460000000000001,-0.0020000000000000018,-0.01100000000000001,0.01100000000000001,-0.01680000000000001,0.0008999999999999841,0.010599999999999998,-0.002899999999999986,0.04350000000000001,-0.018100000000000005,-0.026099999999999984,0.014100000000000001,0.0,-0.014000000000000012,0.008800000000000002,-0.0416,-0.020199999999999996,0.002599999999999991,-0.003799999999999998,0.01770000000000002,0.01369999999999999,-0.02729999999999999,-0.009599999999999997,0.0335,-0.026600000000000013,0.025999999999999995,-0.012900000000000023,-0.01469999999999999,0.032799999999999996,0.046400000000000025,0.009300000000000003,0.01200000000000001,0.0020999999999999908,-0.010600000000000026,-0.038599999999999995,0.04529999999999998,0.02099999999999999,-0.0348,0.007300000000000001,-0.0403,-0.009300000000000003,0.021100000000000008,-0.022199999999999998,-0.033399999999999985,-0.0015000000000000013,-0.026599999999999985,0.0007000000000000062,-0.006200000000000011,0.037599999999999995,-0.01849999999999999,0.002799999999999997,0.0295,0.0204,0.008300000000000002,0.036000000000000004,-0.0446,-0.008700000000000013],"cost_curve":[0.0251325,0.0316375,0.029035,0.03056,0.0322275,0.0311475,0.031175,0.0338825,0.0325975,0.032375,0.032735,0.031495,0.032975,0.0355125,0.0360375,0.0363825,0.034865,0.03441,0.039915,0.03949,0.0407725,0.04014,0.036505,0.0386075,0.04169,0.04739,0.0408075,0.0426975,0.043165,0.04971,0.0441625,0.0467975,0.0461275,0.049225,0.045515,0.0480575,0.048505,0.050245,0.05402,0.0517775,0.049205,0.0503675,0.0525825,0.0515625,0.0527475,0.056075,0.05411,0.051715,0.05105,0.05655,0.0569875,0.0529275,0.06132,0.0570975,0.0604725,0.0571975,0.0609125,0.05897,0.0594275,0.0567275,0.05877,0.0589425,0.06744,0.0627475,0.0632175,0.06402,0.06737,0.06461,0.0697875,0.068565,0.0720575,0.065315,0.0712025,0.064125,0.0716175,0.070875,0.068605,0.06729,0.0712725,0.0668875,0.06851,0.0678175,0.0699425,0.0664325,0.068975,0.073305,0.0713375,0.0746075,0.0768125,0.0796325]},{"run_name":"ace-gpt-5.4","task":"blind_spectrum_monitoring","run_index":3,"reward":19.7597,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":-0.0013000000000040757,"normalized_reward":-1.850823616515149e-05,"normalized_gain":-1.850823616515149e-05,"cost_usd":2.8131975,"latency_seconds":3.709048,"instance_count":90,"reward_curve":[0.192,0.2239,0.2094,0.195,0.2193,0.2128,0.2273,0.2241,0.2019,0.2482,0.2018,0.2474,0.2083,0.2285,0.2312,0.227,0.2065,0.2244,0.2264,0.213,0.2105,0.2333,0.2203,0.2404,0.2126,0.2117,0.2072,0.1974,0.2483,0.2221,0.2274,0.2333,0.2127,0.2309,0.2273,0.1955,0.2414,0.1998,0.2203,0.2177,0.2029,0.2215,0.2252,0.2397,0.1982,0.2156,0.2361,0.2027,0.2215,0.1926,0.1973,0.2264,0.2139,0.2246,0.2085,0.2075,0.2235,0.2216,0.2287,0.2095,0.2155,0.221,0.2541,0.2208,0.2101,0.2129,0.2442,0.1996,0.2488,0.2222,0.2114,0.2088,0.2193,0.2176,0.2166,0.2422,0.2472,0.2017,0.2272,0.2212,0.2303,0.1997,0.2166,0.2377,0.2552,0.2419,0.2004,0.2079,0.1981,0.2135],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.024300000000000016,-0.0022999999999999965,-0.031399999999999983,-0.004799999999999999,0.0,0.0,0.029099999999999987,-0.020199999999999996,0.03559999999999999,-0.038599999999999995,0.0189,-0.010999999999999982,-0.019799999999999984,0.039199999999999985,0.029600000000000015,-0.0174,-0.002600000000000019,0.0199,-0.034400000000000014,0.008699999999999986,0.03140000000000001,0.007300000000000001,0.03209999999999999,-0.011799999999999977,-0.021600000000000008,-0.0022000000000000075,-0.0131,0.017100000000000004,0.014899999999999997,0.029200000000000004,0.024800000000000016,0.0032000000000000084,0.028200000000000003,0.003799999999999998,-0.0184,0.038500000000000006,-0.0416,0.022999999999999993,-0.002599999999999991,-0.023499999999999993,0.02890000000000001,-0.014499999999999985,0.018100000000000005,-0.029100000000000015,-0.011799999999999977,0.014600000000000002,-0.028200000000000003,-0.012699999999999989,-0.03609999999999999,-0.0204,0.004899999999999988,0.006400000000000017,0.011899999999999994,-0.016100000000000003,-0.01770000000000002,0.0237,-0.014500000000000013,0.03319999999999998,-0.006100000000000022,-0.026400000000000007,0.009599999999999997,0.037500000000000006,-0.00020000000000000573,0.01200000000000001,-0.002599999999999991,0.016999999999999987,-0.05559999999999998,0.03999999999999998,0.0010000000000000009,-0.04269999999999999,-0.005099999999999993,-0.027900000000000008,-0.012700000000000017,-0.004200000000000009,0.004500000000000004,0.0050000000000000044,-0.011200000000000015,-0.02159999999999998,0.02150000000000002,0.022400000000000003,-0.0179,0.0,0.027599999999999986,0.03589999999999999,0.04150000000000001,0.0007999999999999952,0.006200000000000011,-0.0461,-0.008700000000000013],"cost_curve":[0.0258675,0.027245,0.027515,0.025395,0.0240525,0.025475,0.0291175,0.0291425,0.02902,0.028175,0.025805,0.0284925,0.0302125,0.02861,0.0313425,0.0289075,0.0276575,0.0301225,0.0337325,0.0264925,0.0286725,0.0297225,0.0287575,0.028995,0.03051,0.0308725,0.0331475,0.033295,0.02949,0.0302125,0.0314225,0.03599,0.0296025,0.0315,0.03732,0.02856,0.0314875,0.032565,0.0318,0.029135,0.0306475,0.0315925,0.0385025,0.03044,0.031235,0.0315125,0.0290975,0.0308325,0.0292525,0.0292075,0.0326425,0.03152,0.0309375,0.03359,0.0296375,0.02954,0.0292375,0.0309275,0.02959,0.0308475,0.03132,0.0309675,0.0350675,0.030575,0.0293925,0.0307425,0.0321825,0.0371125,0.032295,0.0343125,0.0315925,0.0325875,0.038965,0.03264,0.03442,0.03454,0.0367625,0.0307725,0.03287,0.0342275,0.03562,0.0326325,0.03353,0.0355575,0.0338775,0.035425,0.0323025,0.03451,0.0312525,0.0389175]},{"run_name":"ace-gpt-5.4","task":"blind_spectrum_monitoring","run_index":4,"reward":19.759699999999995,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":-0.0013000000000076284,"normalized_reward":-1.8508236165202072e-05,"normalized_gain":-1.8508236165202072e-05,"cost_usd":4.3816625,"latency_seconds":4.057216,"instance_count":90,"reward_curve":[0.2273,0.2474,0.2483,0.2264,0.192,0.2126,0.2083,0.213,0.2221,0.1974,0.2285,0.2019,0.2404,0.2128,0.2105,0.2072,0.2203,0.2094,0.227,0.2244,0.2241,0.2239,0.2018,0.195,0.2333,0.2065,0.2482,0.2193,0.2117,0.2312,0.1973,0.2215,0.1982,0.2029,0.2177,0.2333,0.2361,0.2215,0.2139,0.2203,0.2085,0.2252,0.1955,0.2156,0.2397,0.2246,0.2264,0.2414,0.2216,0.2027,0.2287,0.2309,0.2075,0.2127,0.1998,0.2095,0.2273,0.2274,0.2235,0.1926,0.2114,0.2208,0.1981,0.2303,0.2004,0.2166,0.2419,0.2129,0.2176,0.2422,0.2101,0.2079,0.2472,0.2088,0.2377,0.2155,0.2442,0.1996,0.2222,0.2166,0.2541,0.1997,0.2552,0.2488,0.221,0.2272,0.2017,0.2193,0.2135,0.2212],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,-0.0007999999999999952,0.036599999999999994,0.0,-0.03209999999999999,-0.00019999999999997797,-0.01899999999999999,0.017999999999999988,0.0,-0.015200000000000019,-0.011899999999999994,-0.026600000000000013,0.021100000000000008,-0.035500000000000004,0.01849999999999999,0.009800000000000003,-0.003599999999999992,-0.017600000000000005,0.020500000000000018,-0.02300000000000002,0.022299999999999986,0.021999999999999992,-0.011199999999999988,-0.013300000000000006,0.008900000000000019,-0.026800000000000018,0.0388,0.008800000000000002,-0.01949999999999999,0.023999999999999994,-0.0008999999999999841,0.013000000000000012,-0.011300000000000004,0.00020000000000000573,-0.0058,0.0194,0.03320000000000001,-0.0199,0.016600000000000004,0.0,-0.0179,0.03260000000000002,-0.04419999999999999,-0.005999999999999978,0.012399999999999994,-0.002799999999999997,0.004899999999999988,0.01050000000000001,-0.0126,-0.025999999999999995,0.010999999999999982,0.009399999999999992,0.0,0.0,-0.02479999999999999,-0.01570000000000002,0.027499999999999997,-0.008700000000000013,0.027999999999999997,-0.02300000000000002,-0.0305,0.009399999999999992,-0.01849999999999999,0.009300000000000003,0.0022999999999999965,0.0010999999999999899,0.01469999999999999,-0.042299999999999977,0.008799999999999975,0.02099999999999999,-0.043999999999999984,-0.006000000000000005,0.0,-0.02149999999999999,0.0169,-0.022199999999999998,0.0020000000000000018,-0.013300000000000006,-0.026599999999999985,0.0169,0.04619999999999999,-0.0179,0.038599999999999995,0.038699999999999984,0.001700000000000007,0.026800000000000018,0.0020999999999999908,0.017600000000000005,-0.030700000000000005,-0.0010000000000000009],"cost_curve":[0.0259775,0.0300475,0.0304975,0.0327425,0.0292875,0.0300475,0.0298075,0.0292975,0.031865,0.037045,0.0329625,0.0334775,0.0331675,0.0334875,0.03402,0.0378875,0.032965,0.0349475,0.037905,0.0367625,0.03337,0.036255,0.03634,0.0342625,0.03745,0.0378225,0.0370675,0.03644,0.0386825,0.0422075,0.038675,0.04043,0.0412175,0.0432575,0.044075,0.0464625,0.0467425,0.04784,0.044445,0.045775,0.04625,0.050925,0.0448775,0.04875,0.0513325,0.0550175,0.049505,0.05113,0.0500875,0.0487025,0.0481675,0.0514775,0.047545,0.04941,0.052215,0.0530925,0.0589275,0.0536475,0.0543475,0.05411,0.0562525,0.054895,0.05603,0.05991,0.058245,0.057345,0.0634,0.0580475,0.061345,0.060685,0.05774,0.0584425,0.06327,0.065525,0.0627475,0.0595225,0.0646075,0.068575,0.0657175,0.063185,0.0665,0.0666075,0.06487,0.0695925,0.0655725,0.0693175,0.0652025,0.07399,0.072035,0.069955]},{"run_name":"ace-gpt-5.4","task":"codebase_adaptation","run_index":0,"reward":11.449999999999998,"baseline_reward":10.0,"reference_reward":19.0,"gain":1.4499999999999975,"normalized_reward":0.2094240837696333,"normalized_gain":0.16111111111111084,"cost_usd":18.20286,"latency_seconds":3.724305,"instance_count":19,"reward_curve":[0.8,0.7,0.825,0.725,0.8,0.475,0.7,0.2,0.475,0.525,0.725,0.625,0.875,0.8,0.325,0.2,0.825,0.0,0.85],"baseline_reward_curve":[0.65,0.625,0.425,0.0,0.75,0.525,0.525,0.175,0.325,0.65,0.725,0.75,0.625,0.525,0.45,0.4,0.725,0.275,0.875],"gain_curve":[0.15000000000000002,0.07499999999999996,0.39999999999999997,0.725,0.050000000000000044,-0.050000000000000044,0.17499999999999993,0.025000000000000022,0.14999999999999997,-0.125,0.0,-0.125,0.25,0.275,-0.125,-0.2,0.09999999999999998,-0.275,-0.025000000000000022],"cost_curve":[0.1973125,0.3751675,0.23127,0.3910525,0.1707625,1.32047,0.508395,1.95273,0.89406,1.54695,0.32118,0.8948925,0.1482475,0.3307825,1.91931,3.19867,0.2452275,3.3396275,0.2167525]},{"run_name":"ace-gpt-5.4","task":"codebase_adaptation","run_index":1,"reward":11.749999999999998,"baseline_reward":10.0,"reference_reward":19.0,"gain":1.7499999999999982,"normalized_reward":0.2408376963350784,"normalized_gain":0.19444444444444425,"cost_usd":15.9647825,"latency_seconds":3.847805,"instance_count":19,"reward_curve":[0.625,0.65,0.675,0.525,0.525,0.775,0.2,0.825,0.725,0.0,0.85,0.0,0.65,0.65,0.9,0.85,0.7,0.825,0.8],"baseline_reward_curve":[0.65,0.625,0.425,0.0,0.75,0.525,0.525,0.175,0.325,0.65,0.725,0.75,0.625,0.525,0.45,0.4,0.725,0.275,0.875],"gain_curve":[-0.025000000000000022,0.025000000000000022,0.25000000000000006,0.525,-0.22499999999999998,0.25,-0.325,0.6499999999999999,0.39999999999999997,-0.65,0.125,-0.75,0.025000000000000022,0.125,0.45,0.44999999999999996,-0.025000000000000022,0.5499999999999999,-0.07499999999999996],"cost_curve":[0.4231475,0.65407,0.6969925,0.901665,0.7092425,0.3234975,2.804285,0.21563,0.38099,3.969645,0.20127,1.1919675,0.79248,1.0257575,0.11495,0.26221,0.744925,0.2749475,0.27711]},{"run_name":"ace-gpt-5.4","task":"codebase_adaptation","run_index":2,"reward":12.2,"baseline_reward":10.0,"reference_reward":19.0,"gain":2.1999999999999993,"normalized_reward":0.28795811518324604,"normalized_gain":0.24444444444444435,"cost_usd":13.325535,"latency_seconds":3.811789,"instance_count":19,"reward_curve":[0.7,0.15,0.8,0.75,0.4,0.8,0.0,0.75,0.825,0.45,0.275,0.4,0.85,0.775,0.825,0.875,0.875,0.825,0.875],"baseline_reward_curve":[0.65,0.625,0.425,0.0,0.75,0.525,0.525,0.175,0.325,0.65,0.725,0.75,0.625,0.525,0.45,0.4,0.725,0.275,0.875],"gain_curve":[0.04999999999999993,-0.475,0.37500000000000006,0.75,-0.35,0.275,-0.525,0.575,0.49999999999999994,-0.2,-0.44999999999999996,-0.35,0.22499999999999998,0.25,0.37499999999999994,0.475,0.15000000000000002,0.5499999999999999,0.0],"cost_curve":[0.27381,1.9612625,0.1935175,0.34662,1.141805,0.2484475,1.053765,0.430675,0.21163,1.6132325,2.29939,1.7594225,0.1634625,0.5123525,0.2557875,0.1816,0.175725,0.30725,0.19578]},{"run_name":"ace-gpt-5.4","task":"codebase_adaptation","run_index":3,"reward":11.200000000000003,"baseline_reward":10.0,"reference_reward":19.0,"gain":1.2000000000000028,"normalized_reward":0.18324607329842968,"normalized_gain":0.13333333333333364,"cost_usd":15.824155,"latency_seconds":3.899352,"instance_count":19,"reward_curve":[0.0,0.45,0.65,0.65,0.7,0.825,0.7,0.875,0.775,0.75,0.575,0.9,0.0,0.125,0.775,0.0,0.65,0.9,0.9],"baseline_reward_curve":[0.65,0.625,0.425,0.0,0.75,0.525,0.525,0.175,0.325,0.65,0.725,0.75,0.625,0.525,0.45,0.4,0.725,0.275,0.875],"gain_curve":[-0.65,-0.175,0.22500000000000003,0.65,-0.050000000000000044,0.29999999999999993,0.17499999999999993,0.7,0.45,0.09999999999999998,-0.15000000000000002,0.15000000000000002,-0.625,-0.4,0.325,-0.4,-0.07499999999999996,0.625,0.025000000000000022],"cost_curve":[0.8781125,1.2419225,0.36141,0.7554,0.34453,0.2222325,0.3545325,0.185835,0.2784275,0.38448,0.79139,0.11512,3.58871,3.185215,0.429035,1.48531,0.92025,0.14958,0.1526625]},{"run_name":"ace-gpt-5.4","task":"codebase_adaptation","run_index":4,"reward":11.299999999999999,"baseline_reward":10.0,"reference_reward":19.0,"gain":1.299999999999999,"normalized_reward":0.19371727748691095,"normalized_gain":0.14444444444444432,"cost_usd":14.980735,"latency_seconds":3.841247,"instance_count":19,"reward_curve":[0.625,0.675,0.45,0.625,0.325,0.55,0.575,0.85,0.85,0.825,0.6,0.75,0.0,0.65,0.525,0.825,0.55,0.575,0.475],"baseline_reward_curve":[0.65,0.625,0.425,0.0,0.75,0.525,0.525,0.175,0.325,0.65,0.725,0.75,0.625,0.525,0.45,0.4,0.725,0.275,0.875],"gain_curve":[-0.025000000000000022,0.050000000000000044,0.025000000000000022,0.625,-0.425,0.025000000000000022,0.04999999999999993,0.675,0.5249999999999999,0.17499999999999993,-0.125,0.0,-0.625,0.125,0.07500000000000001,0.42499999999999993,-0.17499999999999993,0.29999999999999993,-0.4],"cost_curve":[0.27349,0.5606825,1.0441575,0.59231,1.8016925,0.4071775,0.634895,0.246685,0.2034225,0.1995225,0.7539975,0.259615,2.9793425,0.7390975,1.2446575,0.2198625,1.08698,1.0159675,0.71718]},{"run_name":"ace-gpt-5.4","task":"cohort_studies","run_index":0,"reward":0.9518,"baseline_reward":0.37550000000000006,"reference_reward":3.24404,"gain":0.5762999999999999,"normalized_reward":-0.018981658472398782,"normalized_gain":0.20090359555732182,"cost_usd":13.02221,"latency_seconds":6.560934,"instance_count":20,"reward_curve":[0.0,0.0347,0.0042,0.0014,0.0,0.0179,0.099,0.0,0.0,0.0119,0.0,0.0,0.0,0.1629,0.0335,0.2336,0.17,0.0388,0.0841,0.0598],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0669,0.1653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1433,0.0,0.0,0.0],"gain_curve":[0.0,0.0347,0.0042,0.0014,0.0,-0.049,-0.0663,0.0,0.0,0.0119,0.0,0.0,0.0,0.1629,0.0335,0.2336,0.0267,0.0388,0.0841,0.0598],"cost_curve":[0.5559125,0.4130975,0.573035,0.4492875,0.74994,0.8148125,0.4798275,0.5463425,0.6915175,0.98767,0.8544775,0.89746,0.83872,0.76795,0.62447,0.5784525,0.4644625,0.4302575,0.6597275,0.64479]},{"run_name":"ace-gpt-5.4","task":"cohort_studies","run_index":1,"reward":0.3406,"baseline_reward":0.37550000000000006,"reference_reward":3.24404,"gain":-0.03490000000000004,"normalized_reward":-0.2906816504707629,"normalized_gain":-0.012166467959310326,"cost_usd":12.1852025,"latency_seconds":6.468656,"instance_count":20,"reward_curve":[0.0,0.0357,0.1045,0.0,0.0436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0733,0.0,0.0,0.0835,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0669,0.1653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1433,0.0,0.0,0.0],"gain_curve":[0.0,0.0357,0.1045,0.0,0.0436,-0.0669,-0.1653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0733,0.0,-0.1433,0.0835,0.0,0.0],"cost_curve":[0.53803,0.6790275,0.5967475,0.49109,0.3689875,0.463995,0.8222475,0.6348875,0.564485,0.693365,0.6192025,0.8148075,0.50574,0.772955,0.68181,0.6890675,0.345365,0.71635,0.5693975,0.617645]},{"run_name":"ace-gpt-5.4","task":"cohort_studies","run_index":2,"reward":1.0212,"baseline_reward":0.37550000000000006,"reference_reward":3.24404,"gain":0.6457,"normalized_reward":0.011869093236839606,"normalized_gain":0.22509708771709652,"cost_usd":15.18823,"latency_seconds":6.082932,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.1517,0.013,0.0165,0.0,0.0,0.0,0.014,0.0,0.0,0.1555,0.0839,0.0796,0.1507,0.1229,0.1024,0.0318,0.0992],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0669,0.1653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1433,0.0,0.0,0.0],"gain_curve":[0.0,0.0,0.0,0.1517,0.013,-0.0504,-0.1653,0.0,0.0,0.014,0.0,0.0,0.1555,0.0839,0.0796,0.1507,-0.020400000000000015,0.1024,0.0318,0.0992],"cost_curve":[1.0202725,0.4592225,0.514125,0.5116575,0.48691,1.364015,0.69454,0.456595,0.931405,0.8588125,0.8099475,0.8977575,0.58586,0.8840825,0.6676675,0.8935575,0.8914575,0.8962675,0.605315,0.7587625]},{"run_name":"ace-gpt-5.4","task":"cohort_studies","run_index":3,"reward":0.5986,"baseline_reward":0.37550000000000006,"reference_reward":3.24404,"gain":0.22309999999999997,"normalized_reward":-0.17599153604736964,"normalized_gain":0.07777475649633611,"cost_usd":8.92332,"latency_seconds":6.3059,"instance_count":20,"reward_curve":[0.0,0.0199,0.0848,0.0101,0.0467,0.0,0.0,0.0318,0.0252,0.0506,0.0,0.004,0.1097,0.0,0.0,0.0,0.0029,0.0293,0.0441,0.1395],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0669,0.1653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1433,0.0,0.0,0.0],"gain_curve":[0.0,0.0199,0.0848,0.0101,0.0467,-0.0669,-0.1653,0.0318,0.0252,0.0506,0.0,0.004,0.1097,0.0,0.0,0.0,-0.14040000000000002,0.0293,0.0441,0.1395],"cost_curve":[0.47593,0.249085,0.3480825,0.2418075,0.546925,0.42047,0.4439225,0.29819,0.4305825,0.65128,0.445185,0.5313275,0.3881825,0.544005,0.4604475,0.458435,0.397115,0.4194575,0.6369225,0.5359675]},{"run_name":"ace-gpt-5.4","task":"cohort_studies","run_index":4,"reward":0.8814000000000001,"baseline_reward":0.37550000000000006,"reference_reward":3.24404,"gain":0.5059,"normalized_reward":-0.05027694550885953,"normalized_gain":0.1763614940004323,"cost_usd":14.667345,"latency_seconds":6.251645,"instance_count":20,"reward_curve":[0.0,0.0162,0.0,0.0,0.0,0.0,0.1542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2126,0.2076,0.0,0.0854,0.0,0.2054],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0669,0.1653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1433,0.0,0.0,0.0],"gain_curve":[0.0,0.0162,0.0,0.0,0.0,-0.0669,-0.011099999999999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2126,0.2076,-0.1433,0.0854,0.0,0.2054],"cost_curve":[0.63771,0.7507175,0.55647,0.6876025,0.7367275,0.61099,0.44835,0.5807125,0.656275,0.9068475,0.52925,0.8876975,0.66264,0.6353375,0.788685,0.8241275,0.7146675,0.6612775,0.83709,1.55417]},{"run_name":"ace-gpt-5.4","task":"database_exploration","run_index":0,"reward":5.6,"baseline_reward":5.466666666666667,"reference_reward":40.0,"gain":0.13333333333333286,"normalized_reward":0.0019342359767891872,"normalized_gain":0.0038610038610038477,"cost_usd":10.070015,"latency_seconds":3.645631,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.06666666666666665,0.33333333333333337,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.6,0.0,0.33333333333333337,0.0,0.4666666666666667,0.4,0.0,0.6666666666666667,0.0,0.0,0.2666666666666667,0.5333333333333333,0.6,0.1333333333333333,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.1333333333333333,0.0,0.6,0.0,0.4666666666666667,0.4,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0],"gain_curve":[0.0,0.0,0.0,-0.7333333333333334,0.0,0.0,-0.1333333333333333,0.0,-0.19999999999999996,0.0,-0.4,-0.06666666666666665,0.0,-0.33333333333333337,0.0,0.0,0.0,0.0,-0.7333333333333334,0.0,0.0,0.0,0.0,-0.06666666666666665,0.0,0.6,0.0,0.33333333333333337,0.0,0.4666666666666667,-0.2666666666666667,0.0,0.6666666666666667,0.0,0.0,0.2666666666666667,0.5333333333333333,0.6,-0.4,0.0],"cost_curve":[0.104805,0.1669625,0.11185,0.19618,0.2041625,0.3480725,0.23634,0.3128175,0.1940125,0.1927675,0.32439,0.2373,0.3495675,0.203205,0.3448675,0.3608675,0.1785775,0.1249375,0.1960875,0.31791,0.314045,0.4082875,0.25067,0.242125,0.24689,0.1736775,0.25505,0.25884,0.222865,0.2293575,0.245445,0.41027,0.15074,0.2554025,0.2669975,0.3294175,0.211995,0.1899975,0.4230725,0.27919]},{"run_name":"ace-gpt-5.4","task":"database_exploration","run_index":1,"reward":7.999999999999999,"baseline_reward":5.466666666666667,"reference_reward":40.0,"gain":2.5333333333333323,"normalized_reward":0.07156673114119923,"normalized_gain":0.07335907335907334,"cost_usd":8.3304375,"latency_seconds":3.793318,"instance_count":40,"reward_curve":[0.2666666666666667,0.7333333333333334,0.6,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.4,0.7333333333333334,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.4,0.0,0.6666666666666667,0.5333333333333333,0.0,0.0,0.0,0.0,0.8,0.1333333333333333,0.19999999999999996],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.1333333333333333,0.0,0.6,0.0,0.4666666666666667,0.4,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0],"gain_curve":[0.2666666666666667,0.7333333333333334,0.6,-0.7333333333333334,0.0,0.0,-0.1333333333333333,0.6666666666666667,-0.6,0.0,-0.4666666666666667,-0.4,0.0,-0.33333333333333337,0.7333333333333334,0.0,0.6666666666666667,0.0,-0.7333333333333334,0.0,0.0,0.6,0.0,-0.4666666666666667,0.6,0.0,0.0,0.0,0.0,0.4,-0.6666666666666667,0.6666666666666667,0.5333333333333333,0.0,0.0,0.0,0.0,0.8,-0.4,0.19999999999999996],"cost_curve":[0.168925,0.0677,0.103305,0.1194975,0.236995,0.1474375,0.2042575,0.106155,0.1700625,0.1586775,0.14894,0.1984825,0.184585,0.2018475,0.095305,0.097215,0.1440225,0.3176025,0.1745225,0.2058375,0.3340575,0.1550525,0.2856775,0.2050675,0.1486,0.250015,0.4298,0.199565,0.229175,0.28027,0.3116175,0.147535,0.198555,0.2705075,0.282935,0.2141425,0.225985,0.1060475,0.4143825,0.3900775]},{"run_name":"ace-gpt-5.4","task":"database_exploration","run_index":2,"reward":12.733333333333333,"baseline_reward":5.466666666666667,"reference_reward":40.0,"gain":7.266666666666666,"normalized_reward":0.20889748549323017,"normalized_gain":0.21042471042471042,"cost_usd":6.4323125,"latency_seconds":3.717598,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.4666666666666667,0.4,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.6666666666666667,0.0,0.5333333333333333,0.6,0.4666666666666667,0.0,0.6666666666666667,0.7333333333333334,0.5333333333333333,0.6,0.4666666666666667,0.0,0.0,0.6,0.5333333333333333,0.6,0.7333333333333334,0.5333333333333333,0.6666666666666667,0.0,0.0,0.4666666666666667,0.0,0.6666666666666667,0.0,0.6666666666666667],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.1333333333333333,0.0,0.6,0.0,0.4666666666666667,0.4,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0],"gain_curve":[0.0,0.0,0.0,-0.2666666666666667,0.4,0.5333333333333333,-0.1333333333333333,0.0,-0.6,0.0,-0.4666666666666667,0.19999999999999996,0.0,-0.7333333333333334,0.6666666666666667,0.0,0.5333333333333333,0.6,-0.2666666666666667,0.0,0.6666666666666667,0.7333333333333334,0.5333333333333333,0.1333333333333333,0.4666666666666667,0.0,0.0,0.6,0.5333333333333333,0.6,0.06666666666666665,0.5333333333333333,0.6666666666666667,0.0,0.0,0.4666666666666667,0.0,0.6666666666666667,-0.5333333333333333,0.6666666666666667],"cost_curve":[0.06485,0.1169175,0.270775,0.149465,0.2061425,0.1350925,0.09588,0.14327,0.1465725,0.146935,0.101415,0.1498675,0.14277,0.1174225,0.1214725,0.1586725,0.15793,0.141625,0.2155675,0.2286525,0.124255,0.1019275,0.1695975,0.1672025,0.1959875,0.1837525,0.18806,0.169315,0.208505,0.206815,0.1172625,0.1871,0.1296675,0.1757175,0.2408775,0.238745,0.1779975,0.14807,0.1557525,0.13441]},{"run_name":"ace-gpt-5.4","task":"database_exploration","run_index":3,"reward":7.0,"baseline_reward":5.466666666666667,"reference_reward":40.0,"gain":1.5333333333333332,"normalized_reward":0.04255319148936173,"normalized_gain":0.0444015444015444,"cost_usd":10.3106175,"latency_seconds":3.692554,"instance_count":40,"reward_curve":[0.0,0.5333333333333333,0.0,0.0,0.4,0.5333333333333333,0.0,0.4,0.0,0.0,0.7333333333333334,0.33333333333333337,0.0,0.6,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.4,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2666666666666667,0.33333333333333337,0.6,0.6666666666666667,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.1333333333333333,0.0,0.6,0.0,0.4666666666666667,0.4,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0],"gain_curve":[0.0,0.5333333333333333,0.0,-0.7333333333333334,0.4,0.5333333333333333,-0.1333333333333333,0.4,-0.6,0.0,0.2666666666666667,-0.06666666666666665,0.0,-0.13333333333333341,0.0,0.0,0.0,0.0,-0.13333333333333341,0.0,0.0,0.0,0.4,-0.4666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,-0.6666666666666667,0.0,0.0,0.0,0.0,0.2666666666666667,0.33333333333333337,0.6,0.13333333333333341,0.0],"cost_curve":[0.1295075,0.1140875,0.2129625,0.2727425,0.1661375,0.1644775,0.13743,0.18427,0.22407,0.1707375,1.4164,0.224125,0.2284225,0.14162,0.1855275,0.2188475,0.2592525,0.2864975,0.1473025,0.1933175,0.248145,0.30293,0.243065,0.2392075,0.148035,0.2298175,0.20004,0.1989,0.3568125,0.2624575,0.4600875,0.2601625,0.2816775,0.2736275,0.2784925,0.32865,0.327025,0.1713,0.19931,0.22314]},{"run_name":"ace-gpt-5.4","task":"database_exploration","run_index":4,"reward":5.933333333333333,"baseline_reward":5.466666666666667,"reference_reward":40.0,"gain":0.4666666666666659,"normalized_reward":0.011605415860735019,"normalized_gain":0.013513513513513492,"cost_usd":8.7363375,"latency_seconds":3.674446,"instance_count":40,"reward_curve":[0.5333333333333333,0.0,0.0,0.0,0.0,0.8,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.5333333333333333,0.6,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.6,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.1333333333333333,0.0,0.6,0.0,0.4666666666666667,0.4,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0],"gain_curve":[0.5333333333333333,0.0,0.0,-0.7333333333333334,0.0,0.8,-0.1333333333333333,0.5333333333333333,-0.6,0.0,-0.4666666666666667,-0.4,0.0,-0.7333333333333334,0.0,0.6,0.0,0.0,-0.20000000000000007,0.6,0.0,0.0,0.0,-0.4666666666666667,0.4666666666666667,0.0,0.0,0.6,0.0,0.0,0.06666666666666665,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,-0.5333333333333333,0.0],"cost_curve":[0.1063575,0.3368875,0.08088,0.08778,0.07659,0.0733,0.0815375,0.148275,0.194035,0.09703,0.13941,0.1549675,0.1179025,0.10655,0.210425,0.166765,0.203305,0.3014775,0.21073,0.1781275,0.165885,0.2669975,0.18706,0.377015,0.264295,0.2177775,0.281915,0.190195,0.178095,0.47441,0.13803,0.377035,0.186235,0.4445975,0.5301275,0.3287975,0.253785,0.26586,0.3391775,0.196715]},{"run_name":"ace-gpt-5.4","task":"exploitable_poker","run_index":0,"reward":0.29999999999999716,"baseline_reward":141.9,"reference_reward":1138.5,"gain":-141.60000000000002,"normalized_reward":-0.1326500149268584,"normalized_gain":-0.1420830824804335,"cost_usd":15.7476225,"latency_seconds":4.983546,"instance_count":120,"reward_curve":[-1.0,1.0,-2.0,10.0,-1.0,1.0,4.0,-1.0,15.0,-39.0,1.0,-8.4,2.0,4.0,-9.0,-2.0,4.0,-1.0,5.5,5.2,2.0,-5.0,2.0,1.0,1.0,2.0,0.5,-0.5,0.5,-1.0,2.0,0.5,1.0,-4.0,0.5,0.5,-2.0,0.0,1.0,90.0,-1.0,3.0,-5.0,1.0,-2.0,-5.0,2.0,2.0,1.0,2.0,3.0,1.0,-6.0,-1.0,2.0,6.0,2.0,-1.0,-2.0,-1.0,1.0,0.5,0.5,-3.0,1.0,1.0,1.0,3.0,1.0,-4.5,2.0,7.0,0.5,-3.0,-2.0,1.0,-1.0,0.5,-1.0,2.0,-1.0,0.5,3.0,-1.0,1.0,-3.0,1.0,2.0,1.0,-2.0,0.5,-1.0,-3.0,-2.0,0.5,0.5,1.0,2.0,-1.0,1.0,0.5,2.0,-2.0,1.0,2.0,2.0,0.5,-100.0,2.0,3.0,2.0,2.0,2.0,-3.0,-4.0,2.0,3.0,2.0,0.5,-6.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-2.0,1.0,4.0,-1.0,14.0,-14.0,1.0,-17.0,2.0,7.0,-1.0,-2.5,4.0,-1.0,5.4,6.0,2.0,-5.0,2.0,-3.0,1.0,-1.0,0.5,-2.0,0.5,-1.0,0.0,0.5,-1.0,-2.0,0.5,0.5,-2.0,0.0,-1.0,50.0,-1.0,3.0,-2.0,-1.0,2.0,-3.0,74.0,2.0,1.0,2.0,2.0,1.0,-4.0,-16.0,1.0,6.0,1.0,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,-2.0,2.4,1.0,-2.4,1.0,16.0,0.5,-2.0,-1.0,-4.0,-1.0,0.5,-1.0,4.0,-1.0,0.5,1.0,-1.0,3.0,-1.0,1.0,7.0,1.0,-2.0,0.5,-1.0,-2.0,-1.0,0.5,0.5,1.0,2.0,4.0,1.0,0.5,-2.0,-4.0,-1.0,-1.0,-2.0,0.5,-93.0,-2.0,3.0,2.0,-1.0,2.0,5.0,-4.0,-2.0,100.0,2.0,0.5,-4.0],"gain_curve":[0.0,0.0,-1.0,6.0,1.0,0.0,0.0,0.0,1.0,-25.0,0.0,8.6,0.0,-3.0,-8.0,0.5,0.0,0.0,0.09999999999999964,-0.7999999999999998,0.0,0.0,0.0,4.0,0.0,3.0,0.0,1.5,0.0,0.0,2.0,0.0,2.0,-2.0,0.0,0.0,0.0,0.0,2.0,40.0,0.0,0.0,-3.0,2.0,-4.0,-2.0,-72.0,0.0,0.0,0.0,1.0,0.0,-2.0,15.0,1.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,3.0,0.6000000000000001,0.0,-2.1,1.0,-9.0,0.0,-1.0,-1.0,5.0,0.0,0.0,0.0,-2.0,0.0,0.0,2.0,0.0,-2.0,-2.0,0.0,-5.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-5.0,0.0,0.0,4.0,2.0,2.0,3.0,4.0,0.0,-7.0,4.0,0.0,0.0,3.0,0.0,-8.0,0.0,4.0,-97.0,0.0,0.0,-2.0],"cost_curve":[0.0550575,0.055085,0.06307,0.0645225,0.0675825,0.0657475,0.06862,0.0702825,0.0693675,0.072545,0.076675,0.075635,0.0796175,0.0809775,0.0861225,0.089575,0.0907975,0.09158,0.096025,0.098735,0.0944775,0.15781,0.0952025,0.061565,0.06795,0.104075,0.0,0.044545,0.0,0.0658275,0.1105575,0.0,0.0487125,0.1486475,0.0,0.0,0.0931275,0.1238575,0.0756275,0.2268525,0.10487,0.1571975,0.1699525,0.0903625,0.1483475,0.18038,0.15336,0.1203475,0.0992025,0.153975,0.1600775,0.1530925,0.1592075,0.16457,0.16606,0.1760775,0.1705775,0.16646,0.169535,0.1706325,0.13782,0.0,0.0,0.182015,0.1083275,0.1846725,0.11241,0.18815,0.111905,0.1922775,0.1878525,0.1950575,0.0,0.1954075,0.19025,0.1579075,0.196635,0.0,0.20054,0.20328,0.1996025,0.0,0.2188675,0.2145425,0.1239175,0.2150175,0.2133425,0.2097875,0.1295675,0.2186825,0.0,0.21948,0.2224975,0.2178675,0.0,0.0,0.1306875,0.1714625,0.13805,0.13363,0.0,0.22748,0.1797575,0.2338225,0.2352175,0.2341625,0.0,0.27838,0.142465,0.24199,0.142525,0.2456625,0.1921075,0.29728,0.29199,0.1995275,0.2040775,0.3009225,0.0,0.308005]},{"run_name":"ace-gpt-5.4","task":"exploitable_poker","run_index":1,"reward":240.5,"baseline_reward":141.9,"reference_reward":1138.5,"gain":98.6,"normalized_reward":0.10637874415364713,"normalized_gain":0.09893638370459562,"cost_usd":10.925745,"latency_seconds":5.084774,"instance_count":120,"reward_curve":[14.0,21.0,1.0,-1.0,4.0,4.0,-2.0,-1.0,4.8,2.4,-1.0,12.8,6.0,-2.0,1.0,-14.0,-6.0,-1.0,-12.0,2.0,-2.0,-2.0,0.0,1.0,-0.5,0.5,100.0,-4.0,2.0,0.5,-1.0,2.0,0.5,2.0,-1.0,0.5,-1.0,2.0,1.0,0.5,3.0,1.0,-5.0,1.0,-1.0,90.0,1.0,-5.0,2.0,1.0,-1.0,2.0,-4.0,-8.0,-7.0,2.0,-0.5,2.0,2.0,-3.0,7.0,1.0,-6.0,4.0,1.0,-4.0,0.5,0.5,-2.0,3.0,0.5,1.0,1.0,1.0,1.0,-2.0,1.0,4.0,-2.0,-2.0,0.5,0.5,1.0,0.5,-0.5,1.0,14.0,-3.0,-0.5,1.0,0.5,-2.0,1.0,1.0,2.0,-4.0,0.5,2.0,2.0,-1.0,-100.0,-0.5,2.0,0.5,2.0,2.0,-2.0,1.0,0.5,5.0,2.0,2.0,100.0,-4.0,2.0,0.5,1.0,1.0,-4.0,-2.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-2.0,1.0,4.0,-1.0,14.0,-14.0,1.0,-17.0,2.0,7.0,-1.0,-2.5,4.0,-1.0,5.4,6.0,2.0,-5.0,2.0,-3.0,1.0,-1.0,0.5,-2.0,0.5,-1.0,0.0,0.5,-1.0,-2.0,0.5,0.5,-2.0,0.0,-1.0,50.0,-1.0,3.0,-2.0,-1.0,2.0,-3.0,74.0,2.0,1.0,2.0,2.0,1.0,-4.0,-16.0,1.0,6.0,1.0,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,-2.0,2.4,1.0,-2.4,1.0,16.0,0.5,-2.0,-1.0,-4.0,-1.0,0.5,-1.0,4.0,-1.0,0.5,1.0,-1.0,3.0,-1.0,1.0,7.0,1.0,-2.0,0.5,-1.0,-2.0,-1.0,0.5,0.5,1.0,2.0,4.0,1.0,0.5,-2.0,-4.0,-1.0,-1.0,-2.0,0.5,-93.0,-2.0,3.0,2.0,-1.0,2.0,5.0,-4.0,-2.0,100.0,2.0,0.5,-4.0],"gain_curve":[15.0,20.0,2.0,-5.0,6.0,3.0,-6.0,0.0,-9.2,16.4,-2.0,29.8,4.0,-9.0,2.0,-11.5,-10.0,0.0,-17.4,-4.0,-4.0,3.0,-2.0,4.0,-1.5,1.5,99.5,-2.0,1.5,1.5,-1.0,1.5,1.5,4.0,-1.5,0.0,1.0,2.0,2.0,-49.5,4.0,-2.0,-3.0,2.0,-3.0,93.0,-73.0,-7.0,1.0,-1.0,-3.0,1.0,0.0,8.0,-8.0,-4.0,-1.5,3.0,3.0,-2.0,6.0,0.5,-6.5,5.0,0.0,-5.0,2.5,-1.9,-3.0,5.4,-0.5,-15.0,0.5,3.0,2.0,2.0,2.0,3.5,-1.0,-6.0,1.5,0.0,0.0,1.5,-3.5,2.0,13.0,-10.0,-1.5,3.0,0.0,-1.0,3.0,2.0,1.5,-4.5,-0.5,0.0,-2.0,-2.0,-100.5,1.5,6.0,1.5,3.0,4.0,-2.5,94.0,2.5,2.0,0.0,3.0,98.0,-9.0,6.0,2.5,-99.0,-1.0,-4.5,2.0],"cost_curve":[0.0485825,0.0566175,0.0600225,0.059305,0.05982,0.061525,0.06571,0.0674775,0.06581,0.07352,0.0659525,0.0669775,0.072665,0.07563,0.0745575,0.0743,0.0802775,0.080965,0.0829175,0.0830625,0.06653,0.0657625,0.088475,0.0527525,0.0348575,0.0,0.0816875,0.112785,0.0918275,0.0,0.042535,0.075235,0.0,0.0796175,0.0658125,0.0,0.079005,0.084525,0.0647875,0.0,0.10314,0.06493,0.130375,0.06876,0.06567,0.15635,0.07362,0.1933875,0.0895125,0.044375,0.12216,0.12897,0.127665,0.1279675,0.127665,0.130415,0.0482225,0.1282475,0.1341725,0.13801,0.135285,0.081805,0.13779,0.08233,0.084295,0.142645,0.0,0.0,0.1446175,0.08581,0.0,0.087795,0.0873025,0.0857525,0.0895975,0.15439,0.0544975,0.0893725,0.1557325,0.1562575,0.0,0.0,0.0604075,0.0,0.0587425,0.0921525,0.155685,0.1606675,0.06475,0.097155,0.0,0.1690825,0.0656175,0.0940625,0.1691325,0.21107,0.0,0.1698025,0.102765,0.107085,0.1653575,0.0667075,0.1408725,0.0,0.1068125,0.14025,0.145585,0.143705,0.0,0.2132275,0.145725,0.11073,0.250585,0.21625,0.1508475,0.0,0.11494,0.115625,0.20203,0.1975375]},{"run_name":"ace-gpt-5.4","task":"exploitable_poker","run_index":2,"reward":160.3,"baseline_reward":141.9,"reference_reward":1138.5,"gain":18.400000000000006,"normalized_reward":0.026569807941088684,"normalized_gain":0.018462773429660854,"cost_usd":12.2683025,"latency_seconds":4.65583,"instance_count":120,"reward_curve":[15.0,-4.0,-5.6,2.0,-2.0,6.0,-4.0,-12.0,-13.2,2.0,8.8,-1.0,-1.0,5.6,8.0,1.0,4.0,-1.0,4.0,1.0,-0.5,-0.5,-1.0,-3.0,-0.5,0.5,2.0,-3.0,0.5,0.5,-1.0,1.0,-0.5,-1.0,-1.0,0.5,-0.5,-0.5,-2.0,-1.0,-2.0,90.0,-1.0,2.0,2.0,0.5,0.0,-2.0,-1.0,-2.0,1.0,1.0,-4.0,-1.0,-2.0,2.0,6.0,-4.0,3.0,-8.0,0.5,2.0,0.5,-1.0,0.5,1.0,-1.0,-1.0,-2.0,-0.5,-1.0,3.0,-1.0,-3.0,-1.0,3.0,-2.0,1.0,-1.0,1.0,1.0,1.0,0.5,5.2,3.0,0.5,0.5,-1.0,-3.0,-1.0,2.0,-5.0,-1.0,0.5,-1.0,-1.0,1.0,2.0,5.0,-2.0,0.5,-3.0,-24.0,3.0,-4.0,-1.0,2.0,0.5,-1.0,100.0,2.0,-1.0,2.0,-1.0,-1.0,0.5,-2.0,0.5,1.0,-2.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-2.0,1.0,4.0,-1.0,14.0,-14.0,1.0,-17.0,2.0,7.0,-1.0,-2.5,4.0,-1.0,5.4,6.0,2.0,-5.0,2.0,-3.0,1.0,-1.0,0.5,-2.0,0.5,-1.0,0.0,0.5,-1.0,-2.0,0.5,0.5,-2.0,0.0,-1.0,50.0,-1.0,3.0,-2.0,-1.0,2.0,-3.0,74.0,2.0,1.0,2.0,2.0,1.0,-4.0,-16.0,1.0,6.0,1.0,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,-2.0,2.4,1.0,-2.4,1.0,16.0,0.5,-2.0,-1.0,-4.0,-1.0,0.5,-1.0,4.0,-1.0,0.5,1.0,-1.0,3.0,-1.0,1.0,7.0,1.0,-2.0,0.5,-1.0,-2.0,-1.0,0.5,0.5,1.0,2.0,4.0,1.0,0.5,-2.0,-4.0,-1.0,-1.0,-2.0,0.5,-93.0,-2.0,3.0,2.0,-1.0,2.0,5.0,-4.0,-2.0,100.0,2.0,0.5,-4.0],"gain_curve":[16.0,-5.0,-4.6,-2.0,0.0,5.0,-8.0,-11.0,-27.2,16.0,7.800000000000001,16.0,-3.0,-1.4000000000000004,9.0,3.5,0.0,0.0,-1.4000000000000004,-5.0,-2.5,4.5,-3.0,0.0,-1.5,1.5,1.5,-1.0,0.0,1.5,-1.0,0.5,0.5,1.0,-1.5,0.0,1.5,-0.5,-1.0,-51.0,-1.0,87.0,1.0,3.0,0.0,3.5,-74.0,-4.0,-2.0,-4.0,-1.0,0.0,0.0,15.0,-3.0,-4.0,5.0,-3.0,4.0,-7.0,-0.5,1.5,0.0,0.0,-0.5,0.0,1.0,-3.4,-3.0,1.9,-2.0,-13.0,-1.5,-1.0,0.0,7.0,-1.0,0.5,0.0,-3.0,2.0,0.5,-0.5,6.2,0.0,1.5,-0.5,-8.0,-4.0,1.0,1.5,-4.0,1.0,1.5,-1.5,-1.5,0.0,0.0,1.0,-3.0,0.0,-1.0,-20.0,4.0,-3.0,1.0,1.5,93.5,1.0,97.0,0.0,0.0,0.0,-6.0,3.0,2.5,-102.0,-1.5,0.5,2.0],"cost_curve":[0.047505,0.0506775,0.05685,0.063895,0.0638475,0.065075,0.0705975,0.07801,0.0785275,0.079425,0.0778725,0.077775,0.0814325,0.0840925,0.08686,0.09039,0.08966,0.094555,0.0914125,0.097015,0.0342475,0.0347075,0.0559175,0.09523,0.0377375,0.0,0.1036425,0.100055,0.0,0.0,0.03671,0.1045775,0.03675,0.0387275,0.0748425,0.0,0.0391025,0.0460325,0.0806925,0.0859475,0.13446,0.1775875,0.0698925,0.1168575,0.13768,0.0,0.1159375,0.0927375,0.06897,0.1210575,0.1173625,0.122305,0.1245775,0.1249075,0.130515,0.1322725,0.1343475,0.12738,0.1351575,0.132695,0.0,0.1366125,0.0,0.14168,0.0,0.0800625,0.147375,0.1396225,0.14755,0.0572075,0.142005,0.1445625,0.1469675,0.1571,0.1483675,0.088815,0.15719,0.05803,0.15031,0.15511,0.061655,0.089225,0.0,0.160815,0.16829,0.0,0.0,0.1644725,0.1681,0.16605,0.1672575,0.173815,0.1765275,0.0,0.17312,0.1079875,0.1361,0.176375,0.215195,0.1881675,0.0,0.2213075,0.219445,0.186565,0.269495,0.159035,0.116725,0.0,0.1163025,0.1937625,0.158045,0.2016575,0.1566,0.121415,0.127265,0.0,0.22066,0.0,0.1273075,0.1339]},{"run_name":"ace-gpt-5.4","task":"exploitable_poker","run_index":3,"reward":117.5,"baseline_reward":141.9,"reference_reward":1138.5,"gain":-24.400000000000006,"normalized_reward":-0.016021494676087166,"normalized_gain":-0.02448324302628939,"cost_usd":13.894865,"latency_seconds":4.750556,"instance_count":120,"reward_curve":[4.0,28.0,1.0,5.6,-2.0,-3.0,4.0,-6.0,-11.2,7.0,-2.0,1.0,-1.0,1.0,-1.0,-1.0,5.6,-1.0,12.0,14.0,-0.5,-3.0,0.0,-3.0,0.0,-2.0,2.0,-1.0,-2.0,-2.0,-1.0,0.5,0.5,2.0,0.5,-2.0,0.0,-5.0,2.0,2.0,90.0,2.0,-4.0,-2.0,0.5,-1.0,-1.0,-2.0,0.5,1.0,-6.0,-8.0,1.0,1.0,2.0,2.0,-4.0,-1.0,3.0,-1.0,1.0,3.0,0.5,22.0,-1.0,2.0,-1.0,1.0,-2.0,0.5,-2.0,4.0,0.5,-1.0,1.0,-1.0,1.0,-1.0,0.5,-3.0,1.0,0.5,0.5,-7.0,-1.0,1.0,1.0,1.0,0.5,-1.0,-1.0,-2.0,4.0,-1.0,4.0,-100.0,76.0,-2.0,0.5,0.5,-2.0,5.0,4.0,0.5,1.0,2.0,3.0,-1.0,-5.0,2.0,0.5,2.0,-4.0,-1.0,-4.0,-1.0,-2.0,2.0,-4.0,3.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-2.0,1.0,4.0,-1.0,14.0,-14.0,1.0,-17.0,2.0,7.0,-1.0,-2.5,4.0,-1.0,5.4,6.0,2.0,-5.0,2.0,-3.0,1.0,-1.0,0.5,-2.0,0.5,-1.0,0.0,0.5,-1.0,-2.0,0.5,0.5,-2.0,0.0,-1.0,50.0,-1.0,3.0,-2.0,-1.0,2.0,-3.0,74.0,2.0,1.0,2.0,2.0,1.0,-4.0,-16.0,1.0,6.0,1.0,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,-2.0,2.4,1.0,-2.4,1.0,16.0,0.5,-2.0,-1.0,-4.0,-1.0,0.5,-1.0,4.0,-1.0,0.5,1.0,-1.0,3.0,-1.0,1.0,7.0,1.0,-2.0,0.5,-1.0,-2.0,-1.0,0.5,0.5,1.0,2.0,4.0,1.0,0.5,-2.0,-4.0,-1.0,-1.0,-2.0,0.5,-93.0,-2.0,3.0,2.0,-1.0,2.0,5.0,-4.0,-2.0,100.0,2.0,0.5,-4.0],"gain_curve":[5.0,27.0,2.0,1.5999999999999996,0.0,-4.0,0.0,-5.0,-25.2,21.0,-3.0,18.0,-3.0,-6.0,0.0,1.5,1.5999999999999996,0.0,6.6,8.0,-2.5,2.0,-2.0,0.0,-1.0,-1.0,1.5,1.0,-2.5,-1.0,-1.0,0.0,1.5,4.0,0.0,-2.5,2.0,-5.0,3.0,-48.0,91.0,-1.0,-2.0,-1.0,-1.5,2.0,-75.0,-4.0,-0.5,-1.0,-8.0,-9.0,5.0,17.0,1.0,-4.0,-5.0,0.0,4.0,0.0,0.0,2.5,0.0,23.0,-2.0,1.0,1.0,-1.4,-3.0,2.9,-3.0,-12.0,0.0,1.0,2.0,3.0,2.0,-1.5,1.5,-7.0,2.0,0.0,-0.5,-6.0,-4.0,2.0,0.0,-6.0,-0.5,1.0,-1.5,-1.0,6.0,0.0,3.5,-100.5,75.0,-4.0,-3.5,-0.5,-2.5,7.0,8.0,1.5,2.0,4.0,2.5,92.0,-3.0,-1.0,-1.5,3.0,-6.0,-6.0,0.0,1.0,-102.0,0.0,-4.5,7.0],"cost_curve":[0.052055,0.0529225,0.0522825,0.0550325,0.056745,0.0605225,0.0600275,0.0661925,0.0645725,0.06624,0.07231,0.07163,0.071205,0.0715775,0.0747175,0.0789275,0.08099,0.086805,0.086135,0.078715,0.0367175,0.10574,0.1085725,0.090245,0.089365,0.0699575,0.0937175,0.05386,0.095075,0.0750425,0.059635,0.0,0.0,0.10034,0.0,0.1277075,0.1044975,0.1700975,0.0867225,0.111135,0.1860375,0.13888,0.1415525,0.09905,0.0,0.075365,0.129635,0.1019175,0.0,0.1275,0.132215,0.1349725,0.1300875,0.13,0.1345525,0.1358025,0.1374675,0.1386725,0.14424,0.13972,0.08278,0.085115,0.0,0.1395375,0.1465325,0.1475325,0.152045,0.1551675,0.1563075,0.0,0.1660625,0.1615425,0.0,0.15739,0.164755,0.170655,0.1730225,0.169785,0.0,0.176835,0.16726,0.0,0.0,0.17418,0.167665,0.1771275,0.17377,0.0700575,0.0,0.1785425,0.177065,0.17859,0.1824575,0.1783975,0.184735,0.14909,0.3349875,0.1852875,0.0,0.0,0.1897325,0.225685,0.1516425,0.0,0.1531875,0.15207,0.1201925,0.11963,0.2032625,0.2468525,0.0,0.122675,0.2467925,0.162245,0.208515,0.2142475,0.21606,0.2579525,0.3087225,0.2154225]},{"run_name":"ace-gpt-5.4","task":"exploitable_poker","run_index":4,"reward":198.9,"baseline_reward":141.9,"reference_reward":1138.5,"gain":57.0,"normalized_reward":0.0649815902079809,"normalized_gain":0.0571944611679711,"cost_usd":13.17818,"latency_seconds":4.765531,"instance_count":120,"reward_curve":[10.0,-4.0,4.0,5.2,-1.0,4.0,4.0,1.0,1.0,-1.0,-7.0,-1.0,5.6,13.6,-16.0,-1.0,-1.0,1.0,4.0,-1.0,-0.5,0.5,90.0,-1.0,-0.5,0.0,2.0,-1.0,-5.0,90.0,-4.0,-2.0,0.5,-2.0,-2.0,-4.0,-1.0,0.0,-1.0,2.0,0.5,-0.5,1.0,0.5,0.5,2.0,1.0,2.0,2.0,-2.0,-1.0,-4.0,-4.0,-2.0,2.0,1.0,-1.0,3.0,2.0,1.0,1.0,4.0,1.0,0.5,-2.0,-1.0,-1.0,-1.0,4.0,0.5,-1.0,1.0,1.0,-3.0,2.0,1.0,-1.0,0.5,0.5,-1.0,0.5,-2.0,1.0,-1.0,-1.0,0.5,1.0,-1.0,16.0,1.0,0.5,-1.0,1.0,-1.0,1.0,1.0,-2.0,-1.0,2.0,-4.0,0.5,-1.0,2.0,-2.0,-3.0,-2.0,5.0,0.5,0.5,-2.0,2.0,-1.0,1.0,1.0,0.5,-2.0,3.0,-5.0,-4.0,2.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-2.0,1.0,4.0,-1.0,14.0,-14.0,1.0,-17.0,2.0,7.0,-1.0,-2.5,4.0,-1.0,5.4,6.0,2.0,-5.0,2.0,-3.0,1.0,-1.0,0.5,-2.0,0.5,-1.0,0.0,0.5,-1.0,-2.0,0.5,0.5,-2.0,0.0,-1.0,50.0,-1.0,3.0,-2.0,-1.0,2.0,-3.0,74.0,2.0,1.0,2.0,2.0,1.0,-4.0,-16.0,1.0,6.0,1.0,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,-2.0,2.4,1.0,-2.4,1.0,16.0,0.5,-2.0,-1.0,-4.0,-1.0,0.5,-1.0,4.0,-1.0,0.5,1.0,-1.0,3.0,-1.0,1.0,7.0,1.0,-2.0,0.5,-1.0,-2.0,-1.0,0.5,0.5,1.0,2.0,4.0,1.0,0.5,-2.0,-4.0,-1.0,-1.0,-2.0,0.5,-93.0,-2.0,3.0,2.0,-1.0,2.0,5.0,-4.0,-2.0,100.0,2.0,0.5,-4.0],"gain_curve":[11.0,-5.0,5.0,1.2000000000000002,1.0,3.0,0.0,2.0,-13.0,13.0,-8.0,16.0,3.5999999999999996,6.6,-15.0,1.5,-5.0,2.0,-1.4000000000000004,-7.0,-2.5,5.5,88.0,2.0,-1.5,1.0,1.5,1.0,-5.5,91.0,-4.0,-2.5,1.5,0.0,-2.5,-4.5,1.0,0.0,0.0,-48.0,1.5,-3.5,3.0,1.5,-1.5,5.0,-73.0,0.0,1.0,-4.0,-3.0,-5.0,0.0,14.0,1.0,-5.0,-2.0,4.0,3.0,2.0,0.0,3.5,0.5,1.5,-3.0,-2.0,1.0,-3.4,3.0,2.9,-2.0,-15.0,0.5,-1.0,3.0,5.0,0.0,0.0,1.5,-5.0,1.5,-2.5,0.0,0.0,-4.0,1.5,0.0,-8.0,15.0,3.0,0.0,0.0,3.0,0.0,0.5,0.5,-3.0,-3.0,-2.0,-5.0,0.0,1.0,6.0,-1.0,-2.0,0.0,4.5,93.5,2.5,-5.0,0.0,0.0,-1.0,-4.0,4.5,0.0,-97.0,-7.0,-4.5,6.0],"cost_curve":[0.04581,0.053855,0.0549725,0.0567975,0.058875,0.0598525,0.0613975,0.0651775,0.0671325,0.0698025,0.0721225,0.06717,0.076965,0.071655,0.080115,0.07452,0.07816,0.0800775,0.0816775,0.08776,0.0315175,0.0,0.13318,0.09268,0.0352625,0.0894475,0.0581125,0.03994,0.16737,0.1606775,0.1212325,0.08353,0.0,0.12447,0.079855,0.1271575,0.0849175,0.1053725,0.0857125,0.1397225,0.0,0.0449525,0.06941,0.0,0.0,0.1173375,0.0515875,0.14388,0.128745,0.0987075,0.1251,0.12846,0.1262825,0.130895,0.1324325,0.1321875,0.14172,0.1423275,0.137535,0.1398275,0.14028,0.14035,0.0801825,0.0,0.1516525,0.1457,0.1451225,0.1476875,0.149755,0.0,0.1548225,0.1489375,0.1526075,0.1603125,0.153965,0.15622,0.1624325,0.0,0.0,0.16855,0.0,0.1710375,0.0671975,0.17443,0.16746,0.0,0.10467,0.1781575,0.1810425,0.1087925,0.0,0.18348,0.1127025,0.1887425,0.1839575,0.11059,0.1486825,0.15762,0.1539425,0.2261,0.0,0.11763,0.1504525,0.204565,0.246085,0.2080775,0.1621825,0.0,0.0,0.2437075,0.122935,0.2159525,0.12872,0.1254275,0.0,0.2201375,0.2189275,0.227825,0.3164975,0.176435]},{"run_name":"ace-gpt-5.4","task":"sales_prediction","run_index":0,"reward":6.3377,"baseline_reward":5.202999999999999,"reference_reward":12.0,"gain":1.1347000000000005,"normalized_reward":0.11563871491714435,"normalized_gain":0.166941297631308,"cost_usd":7.1096625,"latency_seconds":7.449512,"instance_count":12,"reward_curve":[0.602,0.5626,0.572,0.5137,0.6006,0.5264,0.5545,0.5157,0.4734,0.4704,0.4946,0.4518],"baseline_reward_curve":[0.4603,0.3444,0.432,0.5612,0.5394,0.3244,0.5338,0.3569,0.3256,0.5664,0.4185,0.3401],"gain_curve":[0.1417,0.2182,0.13999999999999996,-0.04749999999999999,0.06120000000000003,0.20199999999999996,0.02069999999999994,0.15880000000000005,0.1478,-0.09600000000000003,0.0761,0.11169999999999997],"cost_curve":[0.4776075,0.45622,0.42009,0.4828625,0.6492875,0.874785,0.57204,0.5131675,0.5055275,0.698335,0.7742375,0.6855025]},{"run_name":"ace-gpt-5.4","task":"sales_prediction","run_index":1,"reward":6.3401,"baseline_reward":5.202999999999999,"reference_reward":12.0,"gain":1.1371000000000002,"normalized_reward":0.11601355678073311,"normalized_gain":0.16729439458584672,"cost_usd":7.961065,"latency_seconds":7.054823,"instance_count":12,"reward_curve":[0.5496,0.5171,0.572,0.5566,0.5515,0.542,0.5361,0.5336,0.4445,0.5279,0.5065,0.5027],"baseline_reward_curve":[0.4603,0.3444,0.432,0.5612,0.5394,0.3244,0.5338,0.3569,0.3256,0.5664,0.4185,0.3401],"gain_curve":[0.08929999999999999,0.17270000000000002,0.13999999999999996,-0.0046000000000000485,0.0121,0.21760000000000002,0.0022999999999999687,0.17669999999999997,0.1189,-0.03849999999999998,0.08799999999999997,0.16260000000000002],"cost_curve":[0.3943325,0.4849125,0.6789375,0.66373,0.6215575,0.47951,0.7650125,0.78027,0.6053175,0.7436675,0.85119,0.8926275]},{"run_name":"ace-gpt-5.4","task":"sales_prediction","run_index":2,"reward":6.2737,"baseline_reward":5.202999999999999,"reference_reward":12.0,"gain":1.0707000000000004,"normalized_reward":0.10564293188810972,"normalized_gain":0.1575253788436075,"cost_usd":8.7063325,"latency_seconds":7.234341,"instance_count":12,"reward_curve":[0.6074,0.541,0.468,0.5621,0.5484,0.5492,0.527,0.5682,0.4752,0.4842,0.5069,0.4361],"baseline_reward_curve":[0.4603,0.3444,0.432,0.5612,0.5394,0.3244,0.5338,0.3569,0.3256,0.5664,0.4185,0.3401],"gain_curve":[0.14710000000000006,0.19660000000000005,0.03600000000000003,0.0009000000000000119,0.009000000000000008,0.2248,-0.006800000000000028,0.21130000000000004,0.1496,-0.0822,0.08840000000000003,0.09599999999999997],"cost_curve":[0.36306,0.5239825,0.7070625,0.4630625,0.5818925,0.668265,0.817175,1.05644,0.8785425,0.9395725,0.718215,0.9890625]},{"run_name":"ace-gpt-5.4","task":"sales_prediction","run_index":3,"reward":5.9221,"baseline_reward":5.202999999999999,"reference_reward":12.0,"gain":0.719100000000001,"normalized_reward":0.05072859887235082,"normalized_gain":0.10579667500367823,"cost_usd":8.051785,"latency_seconds":7.371843,"instance_count":12,"reward_curve":[0.3193,0.4795,0.571,0.4978,0.5268,0.4551,0.5886,0.5024,0.494,0.53,0.4843,0.4733],"baseline_reward_curve":[0.4603,0.3444,0.432,0.5612,0.5394,0.3244,0.5338,0.3569,0.3256,0.5664,0.4185,0.3401],"gain_curve":[-0.14100000000000001,0.1351,0.13899999999999996,-0.06340000000000001,-0.012599999999999945,0.13069999999999998,0.05479999999999996,0.14549999999999996,0.1684,-0.03639999999999999,0.06580000000000003,0.13319999999999999],"cost_curve":[0.5729275,0.702535,0.659545,0.7409175,0.7838925,0.800445,0.5506925,0.632385,0.601255,0.826525,0.546175,0.63449]},{"run_name":"ace-gpt-5.4","task":"sales_prediction","run_index":4,"reward":5.704700000000001,"baseline_reward":5.202999999999999,"reference_reward":12.0,"gain":0.5017000000000014,"normalized_reward":0.016774173395598894,"normalized_gain":0.0738119758717083,"cost_usd":9.98995,"latency_seconds":7.248919,"instance_count":12,"reward_curve":[0.5928,0.5167,0.4676,0.4693,0.524,0.4981,0.4717,0.5382,0.4268,0.4545,0.3408,0.4042],"baseline_reward_curve":[0.4603,0.3444,0.432,0.5612,0.5394,0.3244,0.5338,0.3569,0.3256,0.5664,0.4185,0.3401],"gain_curve":[0.1325,0.17230000000000006,0.03560000000000002,-0.09190000000000004,-0.01539999999999997,0.17369999999999997,-0.062100000000000044,0.18130000000000002,0.10120000000000001,-0.1119,-0.07769999999999999,0.06409999999999999],"cost_curve":[0.551965,0.437095,0.5949,0.765045,1.0305975,1.0191325,1.045165,0.9341275,0.7493,0.8298625,1.2676225,0.7651375]},{"run_name":"claude-code-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":0,"reward":39.60990000000002,"baseline_reward":19.7601,"reference_reward":90.0,"gain":19.849800000000016,"normalized_reward":0.28259086832101843,"normalized_gain":0.2826000606492893,"cost_usd":4.81815915,"latency_seconds":13.412637,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3561,0.3601,0.3697,0.3604,0.351,0.3442,0.3333,0.3601,0.3713,0.3759,0.489,0.4608,0.4763,0.5012,0.5075,0.5019,0.516,0.513,0.5206,0.5218,0.5056,0.4908,0.472,0.4613,0.4399,0.4368,0.4225,0.4987,0.4975,0.5044,0.4899,0.4812,0.5039,0.5139,0.5085,0.4889,0.4921,0.5223,0.496,0.5059,0.5073,0.4953,0.5339,0.5921,0.6205,0.605,0.5906,0.5834,0.5861,0.5814,0.6178,0.6223,0.6089,0.534,0.5374,0.533,0.4988,0.5437,0.5127,0.4616,0.4184,0.445,0.4069,0.4015,0.4224,0.3995,0.3915,0.4365,0.4647,0.4369,0.4346,0.3608,0.356,0.3756,0.3656,0.3915,0.3487,0.3482,0.266,0.204,0.2222,0.2199,0.2283,0.2123,0.2197,0.2257,0.2638],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.12970000000000004,0.13599999999999998,0.15689999999999998,0.1331,0.15599999999999997,0.12210000000000001,0.12069999999999997,0.11969999999999997,0.1428,0.15660000000000002,0.2407,0.2688,0.27890000000000004,0.2773,0.28049999999999997,0.2954,0.2686,0.31120000000000003,0.3187,0.3088000000000001,0.2973,0.2664,0.23869999999999997,0.2519,0.22940000000000002,0.20560000000000003,0.2153,0.3005,0.28900000000000003,0.29489999999999994,0.2872,0.25770000000000004,0.29000000000000004,0.31100000000000005,0.26709999999999995,0.29159999999999997,0.2718,0.2959,0.3034,0.2662,0.28569999999999995,0.268,0.30650000000000005,0.37059999999999993,0.38960000000000006,0.3717,0.3619,0.3657,0.3645999999999999,0.3739,0.4051,0.39769999999999994,0.3837,0.33420000000000005,0.3013,0.3375,0.2832,0.30179999999999996,0.3013,0.24500000000000002,0.1974,0.2469,0.1914,0.1743,0.16720000000000002,0.1907,0.1703,0.1824,0.2508,0.1897,0.20429999999999998,0.14,0.11829999999999999,0.1334,0.15269999999999997,0.14270000000000002,0.14900000000000002,0.1403,0.048400000000000026,-0.0126,0.0121,0.0006000000000000172,0.027900000000000008,0.012699999999999989,0.018000000000000016,-0.01849999999999999,0.04159999999999997],"cost_curve":[0.03293445,0.055296,0.0401079,0.0505572,0.05027235,0.0502203,0.05771355,0.04054605,0.04633125,0.06419775,0.06676845,0.0700833,0.03140355,0.05513925,0.10086045,0.0843534,0.0376629,0.0420762,0.03970755,0.0400101,0.03888285,0.03964995,0.03982785,0.0742134,0.0462069,0.0493422,0.05182905,0.04858755,0.0473517,0.05362905,0.05339715,0.05284665,0.0528894,0.051621,0.0518934,0.0525948,0.0566874,0.0547575,0.05475945,0.05567295,0.05638695,0.0565953,0.0577425,0.0584556,0.0642543,0.0616296,0.06204315,0.06236205,0.06313995,0.06316395,0.0640722,0.06475125,0.0652188,0.06616305,0.07053735,0.0729489,0.0704721,0.06992865,0.07063005,0.0715746,0.0727818,0.1607343,0.0260976,0.0271857,0.02772705,0.0286179,0.02935455,0.03090195,0.0350826,0.0328965,0.03401955,0.04042035,0.04306245,0.0418437,0.0393501,0.0424392,0.0422928,0.04491735,0.04457415,0.0462099,0.0477096,0.048165,0.04938915,0.04939245,0.05731065,0.0564834,0.06222195,0.05961495,0.06024435,0.0621657]},{"run_name":"claude-code-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":1,"reward":44.48330000000001,"baseline_reward":19.7601,"reference_reward":90.0,"gain":24.723200000000006,"normalized_reward":0.351973974572531,"normalized_gain":0.35198227787909725,"cost_usd":14.02406595,"latency_seconds":34.944355,"instance_count":90,"reward_curve":[0.2072,0.2245,0.2484,0.2674,0.276,0.273,0.2834,0.339,0.3347,0.3536,0.4419,0.4272,0.4275,0.4275,0.3969,0.3996,0.3993,0.3996,0.3879,0.4047,0.3949,0.4025,0.395,0.4013,0.4201,0.421,0.4219,0.422,0.422,0.4264,0.4415,0.442,0.4381,0.4435,0.4802,0.4909,0.4879,0.4945,0.4915,0.4636,0.5006,0.5135,0.513,0.5429,0.5408,0.5554,0.559,0.5773,0.588,0.587,0.5859,0.5742,0.5753,0.5837,0.582,0.5809,0.5961,0.6008,0.6019,0.6043,0.5992,0.6057,0.6138,0.6223,0.619,0.6245,0.5755,0.5788,0.5788,0.5669,0.5704,0.5704,0.571,0.571,0.5698,0.5675,0.5675,0.5734,0.5778,0.5778,0.5812,0.5806,0.5887,0.5764,0.5742,0.5582,0.561,0.5602,0.5602,0.5588],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.0237,0.03670000000000001,0.041000000000000036,0.05190000000000003,0.06020000000000003,0.05609999999999998,0.14400000000000002,0.1126,0.14100000000000001,0.2015,0.19870000000000002,0.2082,0.1792,0.20489999999999997,0.20220000000000002,0.1754,0.1726,0.18140000000000003,0.1573,0.19309999999999997,0.20060000000000003,0.18200000000000002,0.19299999999999998,0.19569999999999999,0.18769999999999998,0.2125,0.2115,0.1908,0.2192,0.24330000000000002,0.2335,0.2286,0.24080000000000001,0.25670000000000004,0.277,0.28500000000000003,0.2531,0.2942,0.24330000000000002,0.27420000000000005,0.32089999999999996,0.2733,0.32130000000000003,0.31349999999999995,0.328,0.3375,0.34640000000000004,0.35469999999999996,0.35829999999999995,0.36819999999999997,0.3527,0.3678,0.371,0.35739999999999994,0.35569999999999996,0.3963,0.3647,0.4064,0.38869999999999993,0.35729999999999995,0.3943,0.3972,0.4013,0.4209,0.40900000000000003,0.3483,0.3236,0.37,0.34569999999999995,0.3163,0.35650000000000004,0.3238,0.34069999999999995,0.349,0.3298,0.32530000000000003,0.36050000000000004,0.32899999999999996,0.3781,0.3733000000000001,0.363,0.3721,0.3663,0.35490000000000005,0.3578,0.36140000000000005,0.35850000000000004,0.31600000000000006,0.33659999999999995],"cost_curve":[0.059175,0.075801,0.0495069,0.0578982,0.0620094,0.0505821,0.06868695,0.07606035,0.06624495,0.1011126,0.11115,0.1148151,0.0748002,0.0767709,0.1300182,0.15593505,0.1375344,0.1132743,0.1775301,0.1257975,0.16074525,0.25452975,0.17110755,0.1424196,0.2139006,0.18222885,0.2246304,0.1223451,0.19908735,0.20343255,0.24851715,0.21523965,0.2193441,0.1850061,0.1882173,0.2342127,0.23808585,0.24343365,0.3522552,0.3671655,0.36835125,0.32483145,0.27801435,0.39345825,0.34643715,0.1978722,0.0797094,0.07411365,0.0841272,0.0968031,0.08904555,0.0916689,0.0904404,0.1007013,0.10963425,0.10429275,0.110373,0.11591535,0.12344895,0.1162341,0.12211635,0.12503145,0.1440012,0.12819045,0.13486335,0.13999575,0.1561812,0.15123555,0.1514859,0.1641846,0.1565151,0.15942915,0.16253505,0.1711998,0.1684428,0.19742055,0.09610245,0.14840415,0.11896185,0.0934497,0.12036855,0.1266846,0.13138125,0.1577436,0.1444815,0.2088774,0.16102455,0.13316805,0.19840815,0.20610195]},{"run_name":"claude-code-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":2,"reward":44.932700000000004,"baseline_reward":19.7601,"reference_reward":90.0,"gain":25.172600000000003,"normalized_reward":0.3583721294437563,"normalized_gain":0.3583803507692921,"cost_usd":5.02212105,"latency_seconds":13.438775,"instance_count":90,"reward_curve":[0.2482,0.2809,0.304,0.2963,0.3488,0.3347,0.4027,0.4117,0.4148,0.3441,0.3731,0.4023,0.3577,0.3558,0.331,0.3731,0.3953,0.4024,0.4457,0.4031,0.4046,0.4218,0.4218,0.4209,0.421,0.442,0.442,0.4245,0.4387,0.4849,0.5296,0.5253,0.5194,0.4831,0.4689,0.5133,0.5643,0.5618,0.5243,0.5191,0.5082,0.4847,0.5089,0.5209,0.5276,0.5641,0.5792,0.5915,0.5807,0.6162,0.6177,0.6007,0.6379,0.5871,0.5537,0.5363,0.5821,0.5935,0.6499,0.5437,0.5162,0.5313,0.4843,0.5931,0.5498,0.4359,0.5279,0.5423,0.6048,0.6214,0.4938,0.5245,0.557,0.5142,0.495,0.5408,0.6304,0.5575,0.5281,0.6122,0.5896,0.6221,0.6572,0.5891,0.6049,0.534,0.5838,0.5561,0.5413,0.6525],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.03269999999999998,0.0923,0.06990000000000002,0.1247,0.12190000000000001,0.1754,0.2167,0.1927,0.1315,0.13269999999999998,0.17379999999999998,0.13840000000000002,0.10750000000000001,0.139,0.1757,0.1714,0.17539999999999997,0.2392,0.1557,0.2028,0.2199,0.2088,0.21259999999999998,0.1966,0.2087,0.2326,0.214,0.2075,0.2777,0.3314,0.31679999999999997,0.30989999999999995,0.2804,0.24539999999999998,0.2994,0.36140000000000005,0.32039999999999996,0.32699999999999996,0.2988,0.2818,0.2921,0.2692,0.2993,0.30029999999999996,0.33670000000000005,0.3577,0.36060000000000003,0.3474,0.38749999999999996,0.4,0.3792,0.4304,0.37439999999999996,0.32909999999999995,0.3111,0.3823,0.35740000000000005,0.4544,0.32809999999999995,0.2743,0.31989999999999996,0.26770000000000005,0.3721,0.35169999999999996,0.2204,0.3007,0.2871,0.396,0.40019999999999994,0.23970000000000002,0.3106,0.3098000000000001,0.2839,0.2742,0.3030999999999999,0.3882,0.3446,0.2793,0.4125,0.38170000000000004,0.40449999999999997,0.4406,0.37899999999999995,0.3856,0.3336,0.3842,0.35440000000000005,0.29710000000000003,0.43029999999999996],"cost_curve":[0.05516415,0.05832825,0.0356346,0.04666635,0.0532989,0.0526812,0.06179025,0.0688122,0.0702957,0.06871815,0.06782235,0.06742905,0.06777,0.06690345,0.0742824,0.075258,0.07721715,0.0739485,0.08018235,0.07948095,0.07702005,0.0817284,0.0802578,0.0798666,0.0801924,0.0951219,0.04310355,0.0463083,0.0467331,0.09926295,0.0489024,0.04723275,0.04979025,0.05600775,0.05107515,0.11056905,0.05251515,0.056796,0.05738655,0.05830875,0.05531745,0.05470095,0.0588576,0.0587475,0.0570936,0.0574701,0.05758935,0.0586512,0.058788,0.1281399,0.0630528,0.0612513,0.06855135,0.0637053,0.0626034,0.06339825,0.06425835,0.06441135,0.1775865,0.02511735,0.0253854,0.0252522,0.02785725,0.0287085,0.02699295,0.0276834,0.0300714,0.0302781,0.0312678,0.03060285,0.03635055,0.03274965,0.0328071,0.03300015,0.0345237,0.03426705,0.0347919,0.0357624,0.0370761,0.0373788,0.03723705,0.03792945,0.03885345,0.0396039,0.03993165,0.04093755,0.04169655,0.04293195,0.0440472,0.04699065]},{"run_name":"claude-code-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":3,"reward":43.70300000000004,"baseline_reward":19.7601,"reference_reward":90.0,"gain":23.942900000000037,"normalized_reward":0.3408647617420526,"normalized_gain":0.340873207393519,"cost_usd":12.8730495,"latency_seconds":27.491732,"instance_count":90,"reward_curve":[0.192,0.2393,0.2283,0.2409,0.2461,0.2501,0.2805,0.3294,0.3392,0.3554,0.3562,0.3602,0.348,0.3517,0.3624,0.3635,0.3664,0.396,0.3858,0.3883,0.3809,0.392,0.3891,0.4255,0.43,0.43,0.5,0.4883,0.508,0.4841,0.5047,0.4563,0.4946,0.5087,0.5039,0.4603,0.4608,0.4188,0.4823,0.4888,0.5066,0.5563,0.5609,0.5855,0.5574,0.583,0.5725,0.5605,0.5605,0.5605,0.5605,0.5605,0.5605,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563,0.563],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.008899999999999991,0.016600000000000004,0.014500000000000013,0.02200000000000002,0.0373,0.053200000000000025,0.13440000000000002,0.11710000000000001,0.14279999999999998,0.11580000000000001,0.1317,0.12869999999999998,0.10340000000000002,0.1704,0.1661,0.14250000000000002,0.169,0.1793,0.14089999999999997,0.1791,0.19010000000000002,0.1761,0.21719999999999998,0.2056,0.19669999999999999,0.29059999999999997,0.27780000000000005,0.27680000000000005,0.2769,0.30650000000000005,0.2478,0.2851,0.30600000000000005,0.2804,0.24639999999999998,0.2579,0.1774,0.28500000000000003,0.2685,0.28020000000000006,0.3637,0.32119999999999993,0.3639,0.3301,0.35559999999999997,0.351,0.3296,0.3272,0.3318,0.3428,0.33899999999999997,0.353,0.35029999999999994,0.3383999999999999,0.33779999999999993,0.36319999999999997,0.32689999999999997,0.36749999999999994,0.34739999999999993,0.32109999999999994,0.3515999999999999,0.34639999999999993,0.34199999999999997,0.36489999999999995,0.3474999999999999,0.33579999999999993,0.30779999999999996,0.35419999999999996,0.34179999999999994,0.30889999999999995,0.34909999999999997,0.31579999999999997,0.33269999999999994,0.34219999999999995,0.3252999999999999,0.3208,0.35009999999999997,0.3141999999999999,0.36329999999999996,0.35509999999999997,0.34539999999999993,0.34639999999999993,0.35289999999999994,0.34369999999999995,0.3625999999999999,0.36339999999999995,0.36129999999999995,0.3188,0.34079999999999994],"cost_curve":[0.0526098,0.06659835,0.0388887,0.0396297,0.04191225,0.04298775,0.0479859,0.0490068,0.052578,0.05454465,0.05339805,0.0538104,0.05910435,0.05978595,0.06377655,0.06909975,0.0667989,0.0713709,0.076449,0.07401645,0.07726335,0.08063925,0.0814953,0.08712735,0.08652525,0.0932424,0.09608085,0.0995613,0.09715005,0.0995166,0.10243185,0.1065921,0.10679685,0.11513175,0.11758695,0.1157724,0.11787525,0.120708,0.12346515,0.1246995,0.12807615,0.1336122,0.137295,0.1351506,0.13840095,0.1396068,0.14110965,0.28425345,0.1035258,0.0671073,0.08843055,0.09903975,0.09743445,0.0966936,0.07337055,0.0926118,0.1182756,0.14421915,0.10544085,0.1328328,0.16335555,0.1701108,0.1773618,0.18565995,0.1302843,0.1664343,0.23229345,0.16141065,0.2176428,0.26106045,0.23113155,0.20219055,0.32330865,0.25314855,0.26045925,0.2678169,0.3649557,0.23671095,0.28924335,0.29609445,0.3536409,0.3090111,0.31637235,0.32478405,0.33098805,0.394617,0.3274683,0.09396105,0.0820542,0.10897785]},{"run_name":"claude-code-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":4,"reward":48.6819,"baseline_reward":19.7601,"reference_reward":90.0,"gain":28.921799999999998,"normalized_reward":0.4117498825438858,"normalized_gain":0.4117574199279896,"cost_usd":15.24890715,"latency_seconds":30.358283,"instance_count":90,"reward_curve":[0.2273,0.2631,0.3401,0.3546,0.339,0.4145,0.4221,0.4201,0.4013,0.4126,0.4133,0.4135,0.462,0.462,0.4474,0.4514,0.4505,0.4324,0.4334,0.4248,0.4223,0.427,0.4274,0.4274,0.4255,0.4266,0.4344,0.4304,0.4319,0.43,0.4395,0.4439,0.4697,0.478,0.4794,0.4706,0.478,0.523,0.5297,0.5527,0.5532,0.5173,0.5053,0.5163,0.5873,0.5892,0.5769,0.5712,0.5883,0.5851,0.5903,0.5888,0.5881,0.5995,0.6012,0.6085,0.6068,0.6236,0.6222,0.6304,0.6361,0.6246,0.6343,0.6356,0.6312,0.6475,0.6083,0.6076,0.5964,0.5994,0.5952,0.5957,0.5996,0.5997,0.5981,0.6036,0.604,0.5982,0.7249,0.7216,0.726,0.7188,0.7202,0.7206,0.7268,0.7357,0.7409,0.7805,0.7814,0.7871],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,0.014899999999999997,0.12840000000000001,0.12820000000000004,0.11490000000000003,0.2017,0.19479999999999997,0.22509999999999997,0.1792,0.2,0.1729,0.18499999999999997,0.24270000000000003,0.21370000000000003,0.2554,0.254,0.22660000000000002,0.2054,0.22690000000000002,0.1774,0.2205,0.2251,0.2144,0.2191,0.2011,0.19329999999999997,0.225,0.2199,0.20070000000000002,0.2228,0.24130000000000001,0.23540000000000003,0.2602,0.2753,0.2559,0.25670000000000004,0.2751,0.2816,0.3323999999999999,0.3324,0.32680000000000003,0.3247,0.26559999999999995,0.29469999999999996,0.36000000000000004,0.36179999999999995,0.35539999999999994,0.34030000000000005,0.35500000000000004,0.35639999999999994,0.37260000000000004,0.36729999999999996,0.38059999999999994,0.38680000000000003,0.37659999999999993,0.38330000000000003,0.40700000000000003,0.38750000000000007,0.42669999999999997,0.41479999999999995,0.3942,0.4132,0.41769999999999996,0.4146000000000001,0.4331,0.43199999999999994,0.38109999999999994,0.35240000000000005,0.38760000000000006,0.37820000000000004,0.34109999999999996,0.38180000000000003,0.35240000000000005,0.3694,0.37729999999999997,0.3659,0.3618,0.3853,0.47609999999999997,0.5219,0.5181,0.5012,0.5035999999999999,0.5105,0.5075000000000001,0.5353,0.5413,0.5788,0.5372,0.5649],"cost_curve":[0.0569079,0.0595992,0.04115505,0.04842255,0.05373045,0.0544467,0.06129885,0.0587244,0.0990666,0.11253,0.07458645,0.0802281,0.10058595,0.0773664,0.12978645,0.16792875,0.09497235,0.1246302,0.1257963,0.1897839,0.13062645,0.13660095,0.10649475,0.10823265,0.11957595,0.11734125,0.18739395,0.1558431,0.131046,0.1731573,0.2110683,0.1771458,0.2313666,0.2270169,0.1903911,0.25940475,0.24523485,0.3501402,0.25713885,0.3130485,0.1709274,0.29383905,0.28354275,0.34388205,0.32850285,0.13781265,0.0937377,0.10125555,0.09153015,0.11097705,0.09614265,0.1601781,0.1046742,0.134223,0.1420572,0.14359365,0.2173497,0.2201058,0.16674165,0.1414662,0.1806234,0.186558,0.18986325,0.2073006,0.23966955,0.24766665,0.27458055,0.1515147,0.2744241,0.27986415,0.19785015,0.29317665,0.40611375,0.35890965,0.32047485,0.3275664,0.334245,0.45570945,0.42404625,0.06061545,0.06005985,0.0611169,0.06393645,0.06596985,0.0687285,0.0705246,0.06815295,0.09003675,0.08552625,0.0817284]},{"run_name":"claude-code-sonnet-4.6","task":"codebase_adaptation","run_index":0,"reward":9.35,"baseline_reward":5.8999999999999995,"reference_reward":19.0,"gain":3.45,"normalized_reward":-0.010471204188481638,"normalized_gain":0.2633587786259542,"cost_usd":6.8819436,"latency_seconds":5.882549,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.0,0.85,0.4,0.9,0.75,0.825,0.675,0.85,0.0,0.925,0.9,0.0,0.0,0.725,0.65,0.9],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.6,0.75,0.0,0.0,0.7,0.575,0.85,0.0,0.9,0.85,0.55,0.0,0.0,0.125,0.0],"gain_curve":[0.0,0.0,0.0,0.0,0.25,-0.35,0.9,0.75,0.125,0.10000000000000009,0.0,0.0,0.025000000000000022,0.050000000000000044,-0.55,0.0,0.725,0.525,0.9],"cost_curve":[0.08875635,0.1648641,0.0928317,0.0778515,0.12610485,0.8213064,0.1519188,0.3376353,0.287412,0.55636545,0.2747886,0.8626299,0.18007575,0.2541294,0.68342535,1.13554605,0.24903375,0.3887505,0.14851785]},{"run_name":"claude-code-sonnet-4.6","task":"codebase_adaptation","run_index":1,"reward":9.075,"baseline_reward":5.8999999999999995,"reference_reward":19.0,"gain":3.175,"normalized_reward":-0.03926701570680628,"normalized_gain":0.24236641221374042,"cost_usd":8.28912765,"latency_seconds":5.862116,"instance_count":19,"reward_curve":[0.775,0.85,0.0,0.0,0.8,0.0,0.45,0.775,0.0,0.45,0.95,0.6,0.0,0.225,0.875,0.925,0.525,0.875,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.6,0.75,0.0,0.0,0.7,0.575,0.85,0.0,0.9,0.85,0.55,0.0,0.0,0.125,0.0],"gain_curve":[0.775,0.85,0.0,0.0,0.20000000000000007,-0.75,0.45,0.775,-0.7,-0.12499999999999994,0.09999999999999998,0.6,-0.9,-0.625,0.32499999999999996,0.925,0.525,0.75,0.0],"cost_curve":[0.2072691,0.12714075,0.32960505,0.24996345,0.2076708,0.33225165,1.01356185,0.43680255,0.1787298,1.05245175,0.1469826,0.92828325,0.98995965,0.77478915,0.15538635,0.125028,0.65799075,0.22304745,0.1522137]},{"run_name":"claude-code-sonnet-4.6","task":"codebase_adaptation","run_index":2,"reward":8.625,"baseline_reward":5.8999999999999995,"reference_reward":19.0,"gain":2.7250000000000005,"normalized_reward":-0.08638743455497375,"normalized_gain":0.20801526717557253,"cost_usd":6.6643059,"latency_seconds":5.350672,"instance_count":19,"reward_curve":[0.0,0.7,0.0,0.0,0.875,0.0,0.0,0.0,0.85,0.375,0.0,0.5,0.875,0.525,0.875,0.8,0.825,0.475,0.95],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.6,0.75,0.0,0.0,0.7,0.575,0.85,0.0,0.9,0.85,0.55,0.0,0.0,0.125,0.0],"gain_curve":[0.0,0.7,0.0,0.0,0.275,-0.75,0.0,0.0,0.15000000000000002,-0.19999999999999996,-0.85,0.5,-0.025000000000000022,-0.32499999999999996,0.32499999999999996,0.8,0.825,0.35,0.95],"cost_curve":[0.07042815,0.22263975,0.0705708,0.24505845,0.1270998,0.10671555,0.28564845,0.4321485,0.2185746,0.9296844,0.51565185,1.0027494,0.30681975,1.20136845,0.08009925,0.1269561,0.16018455,0.48569925,0.07620885]},{"run_name":"claude-code-sonnet-4.6","task":"codebase_adaptation","run_index":3,"reward":0.0,"baseline_reward":5.8999999999999995,"reference_reward":19.0,"gain":-5.8999999999999995,"normalized_reward":-0.9895287958115182,"normalized_gain":-0.4503816793893129,"cost_usd":5.65598415,"latency_seconds":54.759176,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.6,0.75,0.0,0.0,0.7,0.575,0.85,0.0,0.9,0.85,0.55,0.0,0.0,0.125,0.0],"gain_curve":[0.0,0.0,0.0,0.0,-0.6,-0.75,0.0,0.0,-0.7,-0.575,-0.85,0.0,-0.9,-0.85,-0.55,0.0,0.0,-0.125,0.0],"cost_curve":[0.24197385,0.33051405,0.3463737,0.21238305,0.2597763,0.1658721,0.21315945,0.1461699,0.1612425,0.4724541,0.2300688,0.1087836,0.6533037,0.285384,0.38666325,0.38380245,0.48819165,0.3084408,0.2614269]},{"run_name":"claude-code-sonnet-4.6","task":"codebase_adaptation","run_index":4,"reward":6.1000000000000005,"baseline_reward":5.8999999999999995,"reference_reward":19.0,"gain":0.20000000000000107,"normalized_reward":-0.350785340314136,"normalized_gain":0.015267175572519163,"cost_usd":6.41458365,"latency_seconds":5.874646,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.65,0.0,0.85,0.0,0.85,0.0,0.0,0.875,0.925,0.0,0.7,0.0,0.0,0.0,0.375,0.875],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.6,0.75,0.0,0.0,0.7,0.575,0.85,0.0,0.9,0.85,0.55,0.0,0.0,0.125,0.0],"gain_curve":[0.0,0.0,0.0,0.65,-0.6,0.09999999999999998,0.0,0.85,-0.7,-0.575,0.025000000000000022,0.925,-0.9,-0.15000000000000002,-0.55,0.0,0.0,0.25,0.875],"cost_curve":[0.0854403,0.0873006,0.21453615,0.2782278,0.24656625,0.20448765,0.1121883,0.2248476,0.2471613,0.1245726,0.2061966,0.13335885,0.6045195,0.56392125,0.2922567,0.23424435,2.0173938,0.39977715,0.1375869]},{"run_name":"claude-code-sonnet-4.6","task":"cohort_studies","run_index":0,"reward":0.2668,"baseline_reward":0.8046000000000001,"reference_reward":3.24404,"gain":-0.5378000000000001,"normalized_reward":-0.32348835761978,"normalized_gain":-0.22046043354212447,"cost_usd":7.81926855,"latency_seconds":25.341852,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0397,0.0,0.0,0.0,0.0,0.2271,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0788,0.1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031,0.0,0.0943,0.3725,0.0,0.1155],"gain_curve":[0.0,0.0,0.0,0.0,0.0,-0.0788,-0.1125,0.0,0.0,0.0397,0.0,0.0,0.0,0.0,0.1961,0.0,-0.0943,-0.3725,0.0,-0.1155],"cost_curve":[0.72739695,0.21972375,0.2152773,0.2286951,0.4901157,0.52755135,0.2587596,0.29968875,0.4353225,0.3176307,0.29312025,0.64796715,0.52386225,0.69639435,0.614358,0.1788474,0.35452155,0.22831185,0.29997345,0.2617506]},{"run_name":"claude-code-sonnet-4.6","task":"cohort_studies","run_index":1,"reward":0.1103,"baseline_reward":0.8046000000000001,"reference_reward":3.24404,"gain":-0.6943000000000001,"normalized_reward":-0.3930581363300941,"normalized_gain":-0.2846145016889123,"cost_usd":7.703682,"latency_seconds":24.914501,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0788,0.1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031,0.0,0.0943,0.3725,0.0,0.1155],"gain_curve":[0.0,0.0,0.0,0.0,0.0,-0.0788,-0.1125,0.0,0.0,0.1103,0.0,0.0,0.0,0.0,-0.031,0.0,-0.0943,-0.3725,0.0,-0.1155],"cost_curve":[0.9596706,0.25739415,0.2332251,0.24714675,1.0653723,0.17805795,0.1956882,0.2040255,0.54733695,0.25749855,0.59673165,0.2338773,0.3684498,0.3206157,0.2505978,0.5654592,0.4682241,0.2183154,0.22416015,0.31183485]},{"run_name":"claude-code-sonnet-4.6","task":"cohort_studies","run_index":2,"reward":0.6846000000000001,"baseline_reward":0.8046000000000001,"reference_reward":3.24404,"gain":-0.12,"normalized_reward":-0.13776149790623854,"normalized_gain":-0.049191617748335685,"cost_usd":5.9895786,"latency_seconds":21.507447,"instance_count":20,"reward_curve":[0.0776,0.0,0.0,0.0,0.1506,0.0,0.0,0.0,0.0518,0.0,0.0,0.0,0.0,0.0,0.092,0.0,0.1376,0.0723,0.0868,0.0159],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0788,0.1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031,0.0,0.0943,0.3725,0.0,0.1155],"gain_curve":[0.0776,0.0,0.0,0.0,0.1506,-0.0788,-0.1125,0.0,0.0518,0.0,0.0,0.0,0.0,0.0,0.061,0.0,0.043300000000000005,-0.3002,0.0868,-0.09960000000000001],"cost_curve":[0.45303915,0.23593635,0.21280875,0.2297298,0.4240548,0.28308285,0.37135995,0.18125025,0.3322152,0.2130363,0.22851225,0.2466996,0.6902742,0.23968185,0.207501,0.2099646,0.3358092,0.2561844,0.26659605,0.37184205]},{"run_name":"claude-code-sonnet-4.6","task":"cohort_studies","run_index":3,"reward":1.2978,"baseline_reward":0.8046000000000001,"reference_reward":3.24404,"gain":0.49319999999999997,"normalized_reward":0.13482756474657046,"normalized_gain":0.20217754894565967,"cost_usd":6.09519465,"latency_seconds":20.283678,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1131,0.1428,0.0,0.0238,0.0,0.0,0.3145,0.1275,0.0,0.0,0.0,0.4117,0.1644],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0788,0.1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031,0.0,0.0943,0.3725,0.0,0.1155],"gain_curve":[0.0,0.0,0.0,0.0,0.0,-0.0788,-0.1125,0.1131,0.1428,0.0,0.0238,0.0,0.0,0.3145,0.0965,0.0,-0.0943,-0.3725,0.4117,0.048899999999999985],"cost_curve":[0.46309395,0.3490551,0.25811685,0.2875149,0.54906135,0.43733175,0.1985943,0.19685955,0.41522415,0.24812925,0.27188355,0.35219205,0.2870322,0.1947375,0.2025708,0.21900885,0.3117426,0.2573685,0.4151994,0.18047805]},{"run_name":"claude-code-sonnet-4.6","task":"cohort_studies","run_index":4,"reward":0.1206,"baseline_reward":0.8046000000000001,"reference_reward":3.24404,"gain":-0.684,"normalized_reward":-0.3884794224597028,"normalized_gain":-0.28039222116551343,"cost_usd":8.4630486,"latency_seconds":33.648865,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0788,0.1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031,0.0,0.0943,0.3725,0.0,0.1155],"gain_curve":[0.0,0.0,0.0,0.0,0.0,-0.0788,-0.1125,0.0,0.1206,0.0,0.0,0.0,0.0,0.0,-0.031,0.0,-0.0943,-0.3725,0.0,-0.1155],"cost_curve":[0.67170165,0.39923295,0.35532975,0.3865164,0.6607842,0.27083655,0.323934,0.26987985,0.7327623,0.5082738,0.29539395,0.22625235,0.6756198,0.29835015,0.5577765,0.2311749,0.3140808,0.25861665,0.2779626,0.74856945]},{"run_name":"claude-code-sonnet-4.6","task":"database_exploration","run_index":0,"reward":24.53333333333334,"baseline_reward":8.2,"reference_reward":40.0,"gain":16.33333333333334,"normalized_reward":0.5512572533849132,"normalized_gain":0.5136268343815515,"cost_usd":3.72428805,"latency_seconds":6.964436,"instance_count":40,"reward_curve":[0.5333333333333333,0.0,0.8666666666666667,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.7333333333333334,0.9333333333333333,0.8,0.9333333333333333,0.8666666666666667,0.5333333333333333,0.9333333333333333,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.8,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.6,0.6,0.5333333333333333,0.4666666666666667,0.4666666666666667,0.0,0.6,0.4,0.4666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.6,0.5333333333333333,0.0],"gain_curve":[0.5333333333333333,0.0,0.2666666666666667,0.2666666666666667,0.4,0.4666666666666667,0.2666666666666667,0.9333333333333333,0.20000000000000007,0.5333333333333333,0.4,-0.06666666666666665,0.9333333333333333,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.9333333333333333,0.2666666666666666,0.8,0.9333333333333333,0.0,0.0,-0.5333333333333333,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,-0.5333333333333333,0.33333333333333337,-0.6,0.4,0.9333333333333333],"cost_curve":[0.11467335,0.04465905,0.0396768,0.04851165,0.0257559,0.02709855,0.07468215,0.03412095,0.06423675,0.03303615,0.04975665,0.1868268,0.0409179,0.0645339,0.04916865,0.0437895,0.0679398,0.0469833,0.04585155,0.0962583,0.05900205,0.051966,0.05043135,0.13916985,0.056796,0.15118305,0.05822475,0.06720105,0.44427,0.08020965,0.5584701,0.08296725,0.07884885,0.12579195,0.0868332,0.0849003,0.08665095,0.0850302,0.0863904,0.09147345]},{"run_name":"claude-code-sonnet-4.6","task":"database_exploration","run_index":1,"reward":17.600000000000005,"baseline_reward":8.2,"reference_reward":40.0,"gain":9.400000000000006,"normalized_reward":0.35009671179883967,"normalized_gain":0.2955974842767297,"cost_usd":2.6915262,"latency_seconds":5.27729,"instance_count":40,"reward_curve":[0.6666666666666667,0.8666666666666667,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.8,0.6666666666666667,0.0,0.0,0.0,0.9333333333333333,0.8666666666666667,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.7333333333333334,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.6,0.6,0.5333333333333333,0.4666666666666667,0.4666666666666667,0.0,0.6,0.4,0.4666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.6,0.5333333333333333,0.0],"gain_curve":[0.6666666666666667,0.8666666666666667,0.33333333333333337,0.2666666666666667,-0.5333333333333333,-0.4666666666666667,-0.4666666666666667,0.8,0.06666666666666676,-0.4,-0.4666666666666667,-0.6,0.9333333333333333,0.8666666666666667,0.9333333333333333,0.0,0.9333333333333333,0.0,-0.6666666666666667,0.9333333333333333,0.0,0.7333333333333334,0.0,-0.5333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,-0.5333333333333333,-0.6,0.33333333333333337,0.4,0.9333333333333333],"cost_curve":[0.0761664,0.03003015,0.02368755,0.0370503,0.0223956,0.14919165,0.05559075,0.06524445,0.1127643,0.05573205,0.06466935,0.0827298,0.04623255,0.0650475,0.04781265,0.0462219,0.0546297,0.0536145,0.05001405,0.0534168,0.0506718,0.13583595,0.06264645,0.05709225,0.0566016,0.0619137,0.05708565,0.0913527,0.0608052,0.06296745,0.0615543,0.06314295,0.06707595,0.06539955,0.0652311,0.10917375,0.15588225,0.06969465,0.07310205,0.0720549]},{"run_name":"claude-code-sonnet-4.6","task":"database_exploration","run_index":2,"reward":22.133333333333336,"baseline_reward":8.2,"reference_reward":40.0,"gain":13.933333333333337,"normalized_reward":0.48162475822050305,"normalized_gain":0.4381551362683439,"cost_usd":3.9094707,"latency_seconds":5.845208,"instance_count":40,"reward_curve":[0.0,0.0,0.7333333333333334,0.9333333333333333,0.8,0.8,0.9333333333333333,0.0,0.0,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.7333333333333334,0.6,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.8,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.0,0.0,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.6,0.6,0.5333333333333333,0.4666666666666667,0.4666666666666667,0.0,0.6,0.4,0.4666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.6,0.5333333333333333,0.0],"gain_curve":[0.0,0.0,0.13333333333333341,0.33333333333333337,0.2666666666666667,0.33333333333333337,0.4666666666666667,0.0,-0.6,0.5333333333333333,0.4666666666666667,0.2666666666666667,0.9333333333333333,0.9333333333333333,0.7333333333333334,0.6,0.9333333333333333,0.9333333333333333,0.2666666666666666,0.8,0.0,0.0,0.0,0.4,0.9333333333333333,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.4,0.33333333333333337,-0.6,-0.5333333333333333,0.9333333333333333],"cost_curve":[0.1042605,0.2304114,0.08110785,0.0307167,0.06407295,0.0653157,0.03668895,0.03866595,0.1736445,0.0505203,0.0412623,0.0700539,0.0462453,0.0502758,0.11901345,0.1848576,0.0527595,0.05215305,0.0555762,0.1098795,0.1828788,0.34114635,0.2830611,0.07734765,0.07154625,0.070317,0.08113245,0.07616835,0.0753609,0.0771009,0.12134055,0.08069925,0.0790824,0.0796023,0.08043795,0.0877878,0.0815316,0.1334436,0.08682375,0.08518035]},{"run_name":"claude-code-sonnet-4.6","task":"database_exploration","run_index":3,"reward":23.266666666666673,"baseline_reward":8.2,"reference_reward":40.0,"gain":15.066666666666674,"normalized_reward":0.5145067698259189,"normalized_gain":0.4737945492662476,"cost_usd":3.40092015,"latency_seconds":5.704085,"instance_count":40,"reward_curve":[0.0,0.8666666666666667,0.5333333333333333,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.8,0.8,0.8,0.8,0.6,0.8,0.7333333333333334,0.9333333333333333,0.0,0.0,0.0,0.7333333333333334,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.6,0.6,0.5333333333333333,0.4666666666666667,0.4666666666666667,0.0,0.6,0.4,0.4666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.6,0.5333333333333333,0.0],"gain_curve":[0.0,0.8666666666666667,-0.06666666666666665,-0.6,-0.5333333333333333,0.4666666666666667,0.4666666666666667,0.9333333333333333,0.33333333333333337,0.4,0.33333333333333337,0.20000000000000007,0.8,0.6,0.8,0.7333333333333334,0.9333333333333333,0.0,-0.6666666666666667,0.0,0.7333333333333334,0.0,0.9333333333333333,0.4,0.9333333333333333,0.8666666666666667,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,-0.5333333333333333,0.33333333333333337,0.33333333333333337,0.4,0.9333333333333333],"cost_curve":[0.10096605,0.08258565,0.1630665,0.0329199,0.0444186,0.0337134,0.0317628,0.031917,0.03320715,0.0699075,0.07572375,0.09506535,0.1008003,0.1543659,0.0911451,0.156012,0.04864035,0.04968615,0.08026635,0.0854871,0.1408866,0.2385762,0.06168465,0.0609813,0.0612612,0.09580695,0.10245795,0.0663207,0.06846615,0.0692229,0.0688512,0.1183305,0.07165665,0.07124295,0.0760941,0.15893385,0.0765297,0.07626435,0.0777456,0.07794975]},{"run_name":"claude-code-sonnet-4.6","task":"database_exploration","run_index":4,"reward":22.733333333333338,"baseline_reward":8.2,"reference_reward":40.0,"gain":14.533333333333339,"normalized_reward":0.49903288201160556,"normalized_gain":0.45702306079664584,"cost_usd":2.9655477,"latency_seconds":5.135524,"instance_count":40,"reward_curve":[0.6,0.6,0.0,0.9333333333333333,0.9333333333333333,0.8,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.7333333333333334,0.0,0.0,0.9333333333333333,0.0,0.8666666666666667,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.7333333333333334,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.8666666666666667,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.0],"baseline_reward_curve":[0.0,0.0,0.6,0.6,0.5333333333333333,0.4666666666666667,0.4666666666666667,0.0,0.6,0.4,0.4666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.6,0.5333333333333333,0.0],"gain_curve":[0.6,0.6,-0.6,0.33333333333333337,0.4,0.33333333333333337,0.4,0.9333333333333333,0.33333333333333337,0.4666666666666667,0.2666666666666667,-0.6,0.0,0.9333333333333333,0.0,0.8666666666666667,0.8666666666666667,0.9333333333333333,0.2666666666666666,0.9333333333333333,0.0,0.7333333333333334,0.0,-0.5333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.8666666666666667,0.0,0.9333333333333333,0.4,-0.6,0.33333333333333337,0.4,0.0],"cost_curve":[0.1264326,0.1122939,0.0498909,0.0270495,0.02558025,0.0550665,0.0436986,0.0293217,0.0320907,0.0477378,0.08604555,0.05418345,0.0811596,0.0380424,0.04347195,0.0654579,0.0914127,0.04402485,0.0453651,0.04796325,0.0774081,0.1250838,0.07833075,0.0586674,0.05592075,0.0534975,0.0545187,0.05629155,0.4012155,0.0636672,0.07065225,0.06549435,0.1139919,0.07096395,0.0700569,0.07066515,0.1075785,0.07260015,0.07381005,0.07884405]},{"run_name":"claude-code-sonnet-4.6","task":"exploitable_poker","run_index":0,"reward":363.79999999999995,"baseline_reward":284.5,"reference_reward":1138.5,"gain":79.29999999999995,"normalized_reward":0.2290775201512588,"normalized_gain":0.0928571428571428,"cost_usd":7.99320525,"latency_seconds":7.037621,"instance_count":120,"reward_curve":[-2.5,-0.5,-1.0,9.5,-2.4,2.5,6.5,-8.0,36.0,-7.0,15.5,-30.0,5.4,18.0,-7.0,-5.4,32.0,-2.4,12.2,36.0,-1.0,-1.0,-0.5,1.0,3.0,-3.0,0.5,-1.0,0.5,-0.5,-6.0,0.5,-1.0,-2.0,0.5,0.5,-2.0,-0.5,1.0,93.0,-1.0,-1.0,-0.5,3.0,1.0,-5.0,94.0,-3.0,-0.5,4.0,32.0,4.9,-8.0,-8.0,5.4,32.0,5.4,-18.0,-3.5,-2.4,-0.5,0.5,0.5,-5.4,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,15.0,0.5,-0.5,-7.0,3.5,-0.5,0.5,-0.5,1.0,1.0,0.5,28.0,3.0,3.0,-0.5,1.0,8.0,1.0,-0.5,0.5,1.0,-0.5,-1.0,0.5,0.5,-0.5,-0.5,4.0,3.0,0.5,-3.0,-0.5,-1.0,-1.0,-1.0,0.5,-100.0,-1.0,1.0,-1.0,-0.5,-3.0,-1.0,-0.5,-1.0,95.0,3.5,0.5,-1.0],"baseline_reward_curve":[-2.4,2.5,-2.0,9.5,-5.0,2.5,8.0,-2.4,14.5,-14.5,8.0,-12.5,6.0,21.0,-8.0,-5.0,5.5,-5.6,13.5,36.0,2.0,-11.5,-0.5,1.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,-1.0,-4.0,0.5,0.5,-1.0,0.0,1.0,93.5,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-0.5,-0.5,-1.0,12.4,2.5,-9.5,-9.0,5.0,12.0,2.4,-4.8,-2.5,-2.4,1.0,0.5,0.5,-2.2,1.0,1.0,1.0,1.0,1.0,-4.5,-0.5,17.0,0.5,-6.0,-4.5,6.0,-0.5,0.5,-4.8,5.2,-5.0,0.5,12.5,3.0,3.5,-5.4,1.0,20.5,1.0,-0.5,0.5,1.0,-0.5,-2.5,0.5,0.5,1.0,-1.0,-3.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,-1.0,0.5,-28.0,-1.0,1.0,2.0,1.0,2.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[-0.10000000000000009,-3.0,1.0,0.0,2.6,0.0,-1.5,-5.6,21.5,7.5,7.5,-17.5,-0.5999999999999996,-3.0,1.0,-0.40000000000000036,26.5,3.1999999999999997,-1.3000000000000007,0.0,-3.0,10.5,0.0,0.0,2.0,-2.5,0.0,-0.5,0.0,0.0,-8.0,0.0,0.0,2.0,0.0,0.0,-1.0,-0.5,0.0,-0.5,0.0,1.0,0.0,2.0,2.0,-3.0,84.0,-2.5,0.0,5.0,19.6,2.4000000000000004,1.5,1.0,0.40000000000000036,20.0,3.0000000000000004,-13.2,-1.0,0.0,-1.5,0.0,0.0,-3.2,-1.5,0.0,-1.5,-1.5,0.0,4.0,0.0,-2.0,0.0,5.5,-2.5,-2.5,0.0,0.0,4.3,-4.2,6.0,0.0,15.5,0.0,-0.5,4.9,0.0,-12.5,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,-1.5,0.5,7.0,0.0,0.0,-2.0,0.5,-2.0,0.0,0.0,0.0,-72.0,0.0,0.0,-3.0,-1.5,-5.0,-4.0,0.5,-3.0,-5.0,0.0,0.0,3.0],"cost_curve":[0.0670482,0.03515235,0.0890361,0.0664893,0.0695727,0.066726,0.07591755,0.07554435,0.0853281,0.09375585,0.09624705,0.0991887,0.10088325,0.10291785,0.1080186,0.11498505,0.1161858,0.11839245,0.12542595,0.1222842,0.0329256,0.0312768,0.03105345,0.0336036,0.067224,0.0664977,0.0,0.06809895,0.0,0.0359985,0.072624,0.0,0.07262775,0.1092963,0.0,0.0,0.10955475,0.0389337,0.07986825,0.1882422,0.1142232,0.038433,0.03839445,0.08178435,0.0406173,0.1233756,0.20259195,0.0827478,0.0414684,0.13363875,0.1818051,0.1836039,0.1907661,0.1936029,0.2010738,0.20192775,0.20390535,0.2085492,0.21137985,0.2142846,0.05405265,0.0,0.0,0.286989,0.0151293,0.01079865,0.0102021,0.00980235,0.0112752,0.01066005,0.01126965,0.0641586,0.0,0.0735213,0.1275387,0.0471132,0.01470525,0.0,0.0146661,0.0155121,0.0156714,0.0,0.0684798,0.03541185,0.03493455,0.01792005,0.0179226,0.077286,0.0197253,0.018963,0.0,0.01961925,0.0193374,0.08205105,0.0,0.0,0.0249651,0.0222168,0.06796275,0.0465675,0.0,0.0468159,0.02339325,0.09368115,0.0497175,0.02476005,0.0,0.1547394,0.02640255,0.0267639,0.02665065,0.02653455,0.05416545,0.0283686,0.02817375,0.02799525,0.14368695,0.0593826,0.0,0.03044685]},{"run_name":"claude-code-sonnet-4.6","task":"exploitable_poker","run_index":1,"reward":381.6,"baseline_reward":284.5,"reference_reward":1138.5,"gain":97.10000000000002,"normalized_reward":0.24679072544531797,"normalized_gain":0.1137002341920375,"cost_usd":9.65474985,"latency_seconds":6.238674,"instance_count":120,"reward_curve":[38.0,13.5,2.4,-2.3,7.5,3.5,-2.4,-1.0,11.4,11.4,-3.5,36.0,2.4,-5.3,16.0,-15.0,-7.0,-1.0,-3.0,2.3,-1.0,-0.5,0.0,1.0,-0.5,0.5,94.0,-5.0,0.0,0.5,-1.0,-1.0,0.5,-3.0,-1.0,0.5,-1.0,2.0,1.0,0.5,-5.0,-0.5,-5.0,1.0,-1.0,94.0,1.0,3.0,-0.5,1.0,-3.5,34.0,-8.0,-8.0,-8.0,12.4,4.8,2.3,36.0,-1.0,12.8,1.0,-4.9,5.5,-1.0,-0.5,0.5,0.5,-2.3,3.0,0.5,-2.3,1.0,-0.5,-0.5,-2.3,1.0,3.0,-2.3,-0.5,0.5,0.5,1.0,0.5,-0.5,3.0,13.0,-2.3,-0.5,1.0,0.5,1.0,1.0,1.0,2.3,-3.0,0.5,-1.0,3.0,-1.0,-95.0,-0.5,-1.0,0.5,-1.0,2.0,-1.0,-1.0,0.5,3.0,-1.0,-2.0,96.0,3.0,-3.0,0.5,3.0,3.0,3.0,1.0],"baseline_reward_curve":[-2.4,2.5,-2.0,9.5,-5.0,2.5,8.0,-2.4,14.5,-14.5,8.0,-12.5,6.0,21.0,-8.0,-5.0,5.5,-5.6,13.5,36.0,2.0,-11.5,-0.5,1.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,-1.0,-4.0,0.5,0.5,-1.0,0.0,1.0,93.5,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-0.5,-0.5,-1.0,12.4,2.5,-9.5,-9.0,5.0,12.0,2.4,-4.8,-2.5,-2.4,1.0,0.5,0.5,-2.2,1.0,1.0,1.0,1.0,1.0,-4.5,-0.5,17.0,0.5,-6.0,-4.5,6.0,-0.5,0.5,-4.8,5.2,-5.0,0.5,12.5,3.0,3.5,-5.4,1.0,20.5,1.0,-0.5,0.5,1.0,-0.5,-2.5,0.5,0.5,1.0,-1.0,-3.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,-1.0,0.5,-28.0,-1.0,1.0,2.0,1.0,2.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[40.4,11.0,4.4,-11.8,12.5,1.0,-10.4,1.4,-3.0999999999999996,25.9,-11.5,48.5,-3.6,-26.3,24.0,-10.0,-12.5,4.6,-16.5,-33.7,-3.0,11.0,0.5,0.0,-1.5,1.0,93.5,-4.5,-0.5,1.0,-3.0,-1.5,1.5,1.0,-1.5,0.0,0.0,2.0,0.0,-93.0,-4.0,1.5,-4.5,0.0,0.0,96.0,-9.0,3.5,0.0,2.0,-15.9,31.5,1.5,1.0,-13.0,0.40000000000000036,2.4,7.1,38.5,1.4,11.8,0.5,-5.4,7.7,-2.0,-1.5,-0.5,-0.5,-3.3,7.5,1.0,-19.3,0.5,5.5,4.0,-8.3,1.5,2.5,2.5,-5.7,5.5,0.0,-11.5,-2.5,-4.0,8.4,12.0,-22.8,-1.5,1.5,0.0,0.0,1.5,3.5,1.7999999999999998,-3.5,-0.5,0.0,6.0,-4.0,-95.5,0.5,0.0,-0.5,0.0,3.0,-1.5,27.0,1.5,2.0,-3.0,-3.0,94.0,0.0,-2.0,-1.5,-97.0,-0.5,2.5,5.0],"cost_curve":[0.0698436,0.10096845,0.0530214,0.05639295,0.0612141,0.06603555,0.07513125,0.068913,0.07884615,0.08107005,0.08136585,0.0861033,0.089703,0.0945603,0.09940125,0.10925115,0.10660125,0.10885125,0.11279355,0.11906865,0.03119115,0.0299391,0.12346035,0.0664221,0.03349185,0.0,0.162081,0.1022778,0.137799,0.0,0.03561945,0.06918435,0.0,0.0713124,0.0732006,0.0,0.10800795,0.1120296,0.075447,0.0,0.1157772,0.0385182,0.11789235,0.08568195,0.08439735,0.2037924,0.0885021,0.17136255,0.04301955,0.04339515,0.17751195,0.18436755,0.1866561,0.1898298,0.19389915,0.200259,0.20041695,0.2048823,0.21176145,0.2108337,0.21597225,0.17765475,0.04322445,0.0498768,0.05166435,0.01441605,0.0,0.0,0.05915325,0.03029625,0.0,0.0619251,0.0317967,0.0155238,0.0153426,0.06995445,0.01716015,0.03495765,0.0773331,0.0214563,0.0,0.0,0.0196734,0.0,0.0191643,0.03879765,0.0864771,0.0901779,0.0218706,0.0440511,0.0,0.0226161,0.02250375,0.0227676,0.09622485,0.15377055,0.0,0.0514266,0.07914045,0.053355,0.1664076,0.03073725,0.43226685,0.0,0.02981835,0.0893943,0.06067935,0.1236747,0.0,0.06497415,0.03215115,0.1314921,0.23475105,0.1398711,0.07116585,0.0,0.0730956,0.07299345,0.1504215,0.03777405]},{"run_name":"claude-code-sonnet-4.6","task":"exploitable_poker","run_index":2,"reward":403.6,"baseline_reward":284.5,"reference_reward":1138.5,"gain":119.10000000000002,"normalized_reward":0.2686834510896607,"normalized_gain":0.13946135831381737,"cost_usd":8.92681995,"latency_seconds":6.249654,"instance_count":120,"reward_curve":[36.5,-3.5,-2.5,15.1,-2.4,15.1,-3.0,-15.1,-34.1,1.0,35.1,-1.0,-1.0,34.1,3.5,2.4,8.0,-1.0,15.1,18.0,2.0,-0.5,1.0,-2.8,-0.5,0.5,1.0,-2.0,0.5,0.5,-1.0,1.0,1.0,-1.0,-1.0,0.5,-0.5,-1.0,-1.0,-1.0,-0.5,94.0,-0.5,10.0,-0.5,0.5,-1.0,-1.0,-0.5,-1.0,16.0,2.4,-3.5,-5.6,-1.0,2.4,34.1,-3.0,15.1,-3.5,0.5,7.0,0.5,-0.5,0.5,1.0,-0.5,-5.6,-0.5,-0.5,1.0,13.4,1.0,-2.2,-0.5,3.0,-1.0,1.0,-1.0,-0.5,1.0,1.0,0.5,4.7,26.5,0.5,0.5,-1.0,-2.2,2.0,-0.5,-4.9,-0.5,0.5,-3.0,-0.5,1.0,-1.0,2.0,-1.0,0.5,3.0,-20.0,1.0,-4.0,1.0,2.0,0.5,-3.0,95.0,5.0,1.0,-1.0,-1.0,3.0,0.5,2.0,0.5,3.0,4.0],"baseline_reward_curve":[-2.4,2.5,-2.0,9.5,-5.0,2.5,8.0,-2.4,14.5,-14.5,8.0,-12.5,6.0,21.0,-8.0,-5.0,5.5,-5.6,13.5,36.0,2.0,-11.5,-0.5,1.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,-1.0,-4.0,0.5,0.5,-1.0,0.0,1.0,93.5,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-0.5,-0.5,-1.0,12.4,2.5,-9.5,-9.0,5.0,12.0,2.4,-4.8,-2.5,-2.4,1.0,0.5,0.5,-2.2,1.0,1.0,1.0,1.0,1.0,-4.5,-0.5,17.0,0.5,-6.0,-4.5,6.0,-0.5,0.5,-4.8,5.2,-5.0,0.5,12.5,3.0,3.5,-5.4,1.0,20.5,1.0,-0.5,0.5,1.0,-0.5,-2.5,0.5,0.5,1.0,-1.0,-3.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,-1.0,0.5,-28.0,-1.0,1.0,2.0,1.0,2.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[38.9,-6.0,-0.5,5.6,2.6,12.6,-11.0,-12.7,-48.6,15.5,27.1,11.5,-7.0,13.100000000000001,11.5,7.4,2.5,4.6,1.5999999999999996,-18.0,0.0,11.0,1.5,-3.8,-1.5,1.0,0.5,-1.5,0.0,1.0,-3.0,0.5,2.0,3.0,-1.5,0.0,0.5,-1.0,-2.0,-94.5,0.5,96.0,0.0,9.0,0.5,2.5,-11.0,-0.5,0.0,0.0,3.5999999999999996,-0.10000000000000009,6.0,3.4000000000000004,-6.0,-9.6,31.700000000000003,1.7999999999999998,17.6,-1.1,-0.5,6.5,0.0,1.7000000000000002,-0.5,0.0,-1.5,-6.6,-1.5,4.0,1.5,-3.5999999999999996,0.5,3.8,4.0,-3.0,-0.5,0.5,3.8,-5.7,6.0,0.5,-12.0,1.7000000000000002,23.0,5.9,-0.5,-21.5,-3.2,2.5,-1.0,-5.9,0.0,3.0,-3.5,-1.0,0.0,0.0,5.0,-4.0,0.0,4.0,-19.0,0.0,-3.0,2.0,1.5,28.5,-2.0,94.0,3.0,0.0,-3.0,-4.0,4.0,-1.5,-98.0,-3.0,2.5,8.0],"cost_curve":[0.05182035,0.10207035,0.06686895,0.0581538,0.0623427,0.0635442,0.0655005,0.0729045,0.07658865,0.0851913,0.1122153,0.08518065,0.08908755,0.0966864,0.09909585,0.10115175,0.1071621,0.1098705,0.11358585,0.1175274,0.10081095,0.03028695,0.0301008,0.06460005,0.032535,0.0,0.1000005,0.10078605,0.0,0.0,0.0335691,0.0671091,0.0681051,0.0342135,0.1025142,0.0,0.03527865,0.07095015,0.07249305,0.11463345,0.03718305,0.19024695,0.03957315,0.19836795,0.0398499,0.0,0.0402837,0.04003005,0.0403758,0.08181885,0.1678875,0.16937265,0.1743147,0.17804505,0.17997495,0.18462315,0.1934361,0.1941453,0.19599,0.20030295,0.0,0.2044734,0.0,0.0518604,0.0,0.1038117,0.05229255,0.2127087,0.05338875,0.05307615,0.05372115,0.1859493,0.0382995,0.0510354,0.01520895,0.0253023,0.0529719,0.01572195,0.0577419,0.0149271,0.014496,0.0303474,0.0,0.0670461,0.082278,0.0,0.0,0.0769284,0.07747365,0.0619623,0.0201879,0.08985915,0.02490135,0.0,0.09177315,0.02507595,0.07138425,0.0240066,0.04829205,0.02437725,0.0,0.10328025,0.1397352,0.0309336,0.1707099,0.05849265,0.0587889,0.0,0.0597447,0.15255885,0.0946056,0.0954246,0.06442815,0.0665982,0.13546785,0.0,0.1051935,0.0,0.0702414,0.107379]},{"run_name":"claude-code-sonnet-4.6","task":"exploitable_poker","run_index":3,"reward":330.3,"baseline_reward":284.5,"reference_reward":1138.5,"gain":45.80000000000001,"normalized_reward":0.19574086973828245,"normalized_gain":0.05362997658079627,"cost_usd":8.6004309,"latency_seconds":5.747182,"instance_count":120,"reward_curve":[6.0,8.5,31.0,37.0,-7.0,-16.0,8.0,-3.5,-36.0,16.0,-3.5,4.8,-3.5,2.3,-3.0,-2.3,36.0,-5.3,18.0,41.0,-0.5,-0.5,-1.0,1.0,0.0,-1.0,-1.0,-0.5,-1.0,-1.0,1.0,0.5,0.5,1.0,0.5,-0.5,2.0,-1.0,-0.5,-5.0,94.0,-0.5,-3.0,-0.5,0.5,2.0,1.0,-3.0,0.5,1.0,-3.5,-3.5,33.0,36.0,4.8,2.3,-3.5,-18.0,31.0,-1.0,1.0,3.0,0.5,33.0,3.0,1.0,-10.8,-0.5,-0.5,0.5,1.0,1.0,0.5,-1.0,1.0,-2.3,-0.5,-0.5,0.5,-0.5,15.0,0.5,0.5,-2.2,-1.0,-0.5,1.0,1.0,0.5,-2.3,-0.5,-0.5,2.3,-1.0,1.0,-100.0,95.0,2.0,0.5,0.5,-1.0,1.0,-3.0,0.5,3.0,-3.0,3.0,-1.0,3.0,-0.5,0.5,-1.0,-1.0,-0.5,1.0,1.0,-1.0,-3.0,-1.0,1.0],"baseline_reward_curve":[-2.4,2.5,-2.0,9.5,-5.0,2.5,8.0,-2.4,14.5,-14.5,8.0,-12.5,6.0,21.0,-8.0,-5.0,5.5,-5.6,13.5,36.0,2.0,-11.5,-0.5,1.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,-1.0,-4.0,0.5,0.5,-1.0,0.0,1.0,93.5,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-0.5,-0.5,-1.0,12.4,2.5,-9.5,-9.0,5.0,12.0,2.4,-4.8,-2.5,-2.4,1.0,0.5,0.5,-2.2,1.0,1.0,1.0,1.0,1.0,-4.5,-0.5,17.0,0.5,-6.0,-4.5,6.0,-0.5,0.5,-4.8,5.2,-5.0,0.5,12.5,3.0,3.5,-5.4,1.0,20.5,1.0,-0.5,0.5,1.0,-0.5,-2.5,0.5,0.5,1.0,-1.0,-3.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,-1.0,0.5,-28.0,-1.0,1.0,2.0,1.0,2.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[8.4,6.0,33.0,27.5,-2.0,-18.5,0.0,-1.1,-50.5,30.5,-11.5,17.3,-9.5,-18.7,5.0,2.7,30.5,0.2999999999999998,4.5,5.0,-2.5,11.0,-0.5,0.0,-1.0,-0.5,-1.5,0.0,-1.5,-0.5,-1.0,0.0,1.5,5.0,0.0,-1.0,3.0,-1.0,-1.5,-98.5,95.0,1.5,-2.5,-1.5,1.5,4.0,-9.0,-2.5,1.0,2.0,-15.9,-6.0,42.5,45.0,-0.20000000000000018,-9.7,-5.9,-13.2,33.5,1.4,0.0,2.5,0.0,35.2,2.0,0.0,-11.8,-1.5,-1.5,5.0,1.5,-16.0,0.0,5.0,5.5,-8.3,0.0,-1.0,5.3,-5.7,20.0,0.0,-12.0,-5.2,-4.5,4.9,0.0,-19.5,-0.5,-1.7999999999999998,-1.0,-1.5,2.8,1.5,0.5,-100.5,94.0,3.0,3.5,-2.5,-1.5,2.0,-2.0,-0.5,4.0,-2.0,2.5,27.0,4.0,-1.5,-1.5,-2.0,-3.0,-3.5,2.0,-1.0,-101.0,-6.5,-1.5,5.0],"cost_curve":[0.0694161,0.11789655,0.06724005,0.05590065,0.06417375,0.07129275,0.06811215,0.0711777,0.07660905,0.0795999,0.0811389,0.0875883,0.088659,0.09137595,0.09550965,0.09832335,0.10242645,0.10682055,0.11190765,0.11429325,0.02971695,0.02889045,0.08709765,0.0618153,0.12712935,0.03318105,0.03160365,0.0317532,0.06816855,0.0682539,0.03434835,0.0,0.0,0.0675984,0.0,0.03393015,0.13778805,0.03532125,0.0348504,0.0746172,0.18468765,0.11653365,0.42345765,0.037557,0.0,0.1195566,0.07896195,0.07954755,0.0,0.0801081,0.1635339,0.16757775,0.1776336,0.17612805,0.17960955,0.1815105,0.18588285,0.18975435,0.19524,0.19799055,0.0997407,0.09954765,0.0,0.20403045,0.10349295,0.10345965,0.2097753,0.05357025,0.0529524,0.0,0.05364345,0.1114029,0.0,0.0452679,0.01235355,0.0484029,0.01451445,0.01192125,0.0,0.01216275,0.05446545,0.0,0.0,0.0578298,0.0571527,0.0150861,0.0317613,0.0159891,0.0,0.06834735,0.0169623,0.01679235,0.07085535,0.0742863,0.05912415,0.1228392,0.10816665,0.09304725,0.0,0.0,0.0237048,0.04689285,0.0487197,0.0,0.049857,0.05003985,0.05221185,0.05150235,0.1358196,0.0270927,0.0,0.0280569,0.0551562,0.02788425,0.02871795,0.0859593,0.02917545,0.0596217,0.02978925,0.03048855]},{"run_name":"claude-code-sonnet-4.6","task":"exploitable_poker","run_index":4,"reward":235.8,"baseline_reward":284.5,"reference_reward":1138.5,"gain":-48.69999999999999,"normalized_reward":0.1017016618569012,"normalized_gain":-0.05702576112412177,"cost_usd":8.0861172,"latency_seconds":7.771074,"instance_count":120,"reward_curve":[18.0,-5.4,19.0,38.0,-4.0,33.0,9.0,33.0,2.4,-8.0,-38.0,-1.0,34.0,48.0,-3.5,-3.5,-4.0,2.4,18.0,-1.0,1.0,0.5,-6.0,1.0,-1.0,2.0,-1.0,-1.0,-5.0,10.0,-3.5,-0.5,0.5,-0.5,-2.0,-2.0,-0.5,2.0,-1.0,-0.5,0.5,-0.5,-0.5,0.5,0.5,-1.0,1.0,-1.0,1.0,-0.5,-4.0,-9.0,-9.0,-18.0,16.0,2.4,-1.0,35.0,6.0,-0.5,1.0,3.5,1.0,0.5,-0.5,1.0,1.0,1.0,1.0,0.5,-17.0,3.5,1.0,1.0,36.0,-0.5,1.0,0.5,0.5,-0.5,0.5,-0.5,1.0,1.0,-2.5,0.5,3.0,-0.5,8.0,1.0,0.5,3.0,3.0,-0.5,-0.5,3.5,1.0,1.0,1.0,-3.5,0.5,4.5,6.0,-1.0,-6.0,-1.0,18.0,0.5,0.5,-3.5,-1.0,-1.0,3.5,3.5,0.5,-1.0,1.0,-36.0,-1.0,-3.5],"baseline_reward_curve":[-2.4,2.5,-2.0,9.5,-5.0,2.5,8.0,-2.4,14.5,-14.5,8.0,-12.5,6.0,21.0,-8.0,-5.0,5.5,-5.6,13.5,36.0,2.0,-11.5,-0.5,1.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,-1.0,-4.0,0.5,0.5,-1.0,0.0,1.0,93.5,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-0.5,-0.5,-1.0,12.4,2.5,-9.5,-9.0,5.0,12.0,2.4,-4.8,-2.5,-2.4,1.0,0.5,0.5,-2.2,1.0,1.0,1.0,1.0,1.0,-4.5,-0.5,17.0,0.5,-6.0,-4.5,6.0,-0.5,0.5,-4.8,5.2,-5.0,0.5,12.5,3.0,3.5,-5.4,1.0,20.5,1.0,-0.5,0.5,1.0,-0.5,-2.5,0.5,0.5,1.0,-1.0,-3.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,-1.0,0.5,-28.0,-1.0,1.0,2.0,1.0,2.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[20.4,-7.9,21.0,28.5,1.0,30.5,1.0,35.4,-12.1,6.5,-46.0,11.5,28.0,27.0,4.5,1.5,-9.5,8.0,4.5,-37.0,-1.0,12.0,-5.5,0.0,-2.0,2.5,-1.5,-0.5,-5.5,10.5,-5.5,-1.0,1.5,3.5,-2.5,-2.5,0.5,2.0,-2.0,-94.0,1.5,1.5,0.0,-0.5,1.5,1.0,-9.0,-0.5,1.5,0.5,-16.4,-11.5,0.5,-9.0,11.0,-9.6,-3.4,39.8,8.5,1.9,0.0,3.0,0.5,2.7,-1.5,0.0,0.0,0.0,0.0,5.0,-16.5,-13.5,0.5,7.0,40.5,-6.5,1.5,0.0,5.3,-5.7,5.5,-1.0,-11.5,-2.0,-6.0,5.9,2.0,-21.0,7.0,1.5,0.0,2.0,3.5,2.0,-1.0,3.0,0.0,2.0,4.0,-6.5,0.0,5.5,7.0,-2.0,-5.0,0.0,17.5,28.5,1.5,-4.5,-3.0,-2.0,1.5,0.5,1.5,-3.0,-99.0,-39.5,-1.5,0.5],"cost_curve":[0.07394595,0.06990645,0.07078755,0.05950215,0.07250325,0.07161015,0.0781368,0.0812007,0.08523465,0.10441815,0.09782085,0.09148725,0.11295195,0.10081455,0.1190823,0.11263575,0.1137435,0.1222575,0.12559035,0.1313697,0.06692445,0.0,0.0698889,0.0695205,0.07278045,0.10978665,0.03677565,0.03701265,0.28856565,0.21033405,0.08604405,0.0401373,0.0,0.0391881,0.12640545,0.12500475,0.04510335,0.19358805,0.13046265,0.0438807,0.0,0.0435045,0.043947,0.0,0.0,0.10052115,0.04835655,0.0915816,0.0486966,0.0492963,0.1990077,0.2035377,0.20677245,0.2109834,0.21640185,0.21294165,0.21702915,0.2527854,0.05349045,0.0138996,0.0287757,0.02793105,0.0151566,0.0,0.0142419,0.0140853,0.0140046,0.0143931,0.01443255,0.0,0.0666105,0.0600468,0.01812585,0.01698195,0.0739944,0.01882395,0.0185718,0.0,0.0,0.01864185,0.0,0.01840245,0.0185763,0.01899765,0.0803316,0.0,0.0422124,0.02045535,0.0962817,0.02344185,0.0,0.0454923,0.04691685,0.02314305,0.02271825,0.05095815,0.0246363,0.0243723,0.0248976,0.0505041,0.0,0.08051145,0.0827394,0.02732865,0.05532855,0.0276921,0.1469514,0.0,0.0,0.05889075,0.02938845,0.1197015,0.0621396,0.06256425,0.0,0.03185115,0.0318249,0.2000355,0.0355962,0.06726135]},{"run_name":"claude-code-sonnet-4.6","task":"sales_prediction","run_index":0,"reward":9.1481,"baseline_reward":5.0358,"reference_reward":12.0,"gain":4.112299999999999,"normalized_reward":0.5545785371796272,"normalized_gain":0.5904913701501966,"cost_usd":2.39623785,"latency_seconds":14.830743,"instance_count":12,"reward_curve":[0.7264,0.7062,0.7908,0.7752,0.7486,0.7658,0.7558,0.813,0.7473,0.7492,0.7953,0.7745],"baseline_reward_curve":[0.2524,0.395,0.4465,0.6928,0.2711,0.4154,0.7127,0.3202,0.3947,0.5094,0.285,0.3406],"gain_curve":[0.47400000000000003,0.31120000000000003,0.34429999999999994,0.08240000000000003,0.47750000000000004,0.35040000000000004,0.04310000000000003,0.49279999999999996,0.35259999999999997,0.2398,0.5103,0.43389999999999995],"cost_curve":[0.2264799,0.2098539,0.1680111,0.17020815,0.1907916,0.1809465,0.18702555,0.1951938,0.20469525,0.20758995,0.2233416,0.23210055]},{"run_name":"claude-code-sonnet-4.6","task":"sales_prediction","run_index":1,"reward":10.4876,"baseline_reward":5.0358,"reference_reward":12.0,"gain":5.4518,"normalized_reward":0.7637871522951256,"normalized_gain":0.7828321989603976,"cost_usd":2.08361595,"latency_seconds":10.517387,"instance_count":12,"reward_curve":[0.6767,0.6472,0.9217,0.9301,0.8539,0.9326,0.8975,0.9463,0.8632,0.9131,0.9491,0.9562],"baseline_reward_curve":[0.2524,0.395,0.4465,0.6928,0.2711,0.4154,0.7127,0.3202,0.3947,0.5094,0.285,0.3406],"gain_curve":[0.42429999999999995,0.2522,0.47519999999999996,0.23730000000000007,0.5828,0.5172,0.18479999999999996,0.6261000000000001,0.46849999999999997,0.40370000000000006,0.6641000000000001,0.6156],"cost_curve":[0.2118291,0.17277555,0.1538979,0.1381719,0.14379705,0.1638558,0.1615698,0.1706631,0.18022575,0.18892605,0.1919328,0.20597115]},{"run_name":"claude-code-sonnet-4.6","task":"sales_prediction","run_index":2,"reward":9.729700000000001,"baseline_reward":5.0358,"reference_reward":12.0,"gain":4.693900000000001,"normalized_reward":0.6454152154559797,"normalized_gain":0.6740041928721175,"cost_usd":2.01194805,"latency_seconds":11.087779,"instance_count":12,"reward_curve":[0.6469,0.686,0.836,0.8966,0.8274,0.8024,0.81,0.8903,0.7996,0.8237,0.8687,0.8421],"baseline_reward_curve":[0.2524,0.395,0.4465,0.6928,0.2711,0.4154,0.7127,0.3202,0.3947,0.5094,0.285,0.3406],"gain_curve":[0.3945,0.29100000000000004,0.38949999999999996,0.20379999999999998,0.5563,0.387,0.09730000000000005,0.5701,0.4049,0.3143,0.5837000000000001,0.5015],"cost_curve":[0.2111016,0.16541625,0.1274682,0.1315533,0.15137595,0.14874855,0.15726495,0.16634385,0.17503035,0.182964,0.1930815,0.20159955]},{"run_name":"claude-code-sonnet-4.6","task":"sales_prediction","run_index":3,"reward":9.383600000000001,"baseline_reward":5.0358,"reference_reward":12.0,"gain":4.347800000000001,"normalized_reward":0.5913598950442784,"normalized_gain":0.6243071709600531,"cost_usd":2.4198597,"latency_seconds":12.770636,"instance_count":12,"reward_curve":[-0.079,0.7455,0.8325,0.793,0.8032,0.8901,0.8602,0.9348,0.8452,0.9164,0.9052,0.9365],"baseline_reward_curve":[0.2524,0.395,0.4465,0.6928,0.2711,0.4154,0.7127,0.3202,0.3947,0.5094,0.285,0.3406],"gain_curve":[-0.33140000000000003,0.35050000000000003,0.386,0.10020000000000007,0.5321,0.4747,0.14749999999999996,0.6146,0.45049999999999996,0.40700000000000003,0.6202000000000001,0.5959],"cost_curve":[0.19582935,0.2453193,0.17725605,0.21227895,0.165258,0.1781964,0.1889685,0.19164225,0.20235165,0.21186255,0.21984915,0.23104755]},{"run_name":"claude-code-sonnet-4.6","task":"sales_prediction","run_index":4,"reward":9.1158,"baseline_reward":5.0358,"reference_reward":12.0,"gain":4.08,"normalized_reward":0.5495337904321614,"normalized_gain":0.5858533643490997,"cost_usd":2.1605304,"latency_seconds":11.538163,"instance_count":12,"reward_curve":[0.6536,0.7338,0.7894,0.7385,0.7622,0.7161,0.7734,0.815,0.7734,0.7707,0.7973,0.7924],"baseline_reward_curve":[0.2524,0.395,0.4465,0.6928,0.2711,0.4154,0.7127,0.3202,0.3947,0.5094,0.285,0.3406],"gain_curve":[0.40119999999999995,0.3388,0.3429,0.045700000000000074,0.4911,0.30069999999999997,0.060699999999999976,0.49479999999999996,0.3787,0.2613000000000001,0.5123,0.4518],"cost_curve":[0.18642885,0.2100108,0.1779945,0.14223,0.1487187,0.1571436,0.1673007,0.1751697,0.1845987,0.1973775,0.2021127,0.21144465]},{"run_name":"codex-gpt-5.4","task":"blind_spectrum_monitoring","run_index":0,"reward":32.82769999999999,"baseline_reward":19.7601,"reference_reward":90.0,"gain":13.067599999999992,"normalized_reward":0.18603197653725123,"normalized_gain":0.1860424060968195,"cost_usd":3.1478285,"latency_seconds":11.084235,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3669,0.3599,0.3715,0.3204,0.2831,0.3398,0.3406,0.3804,0.3609,0.3416,0.4461,0.3274,0.3608,0.3847,0.3476,0.33,0.401,0.3929,0.3769,0.4048,0.4155,0.45,0.4246,0.4227,0.3876,0.3236,0.3198,0.3,0.3405,0.3225,0.3021,0.2773,0.3046,0.3181,0.3396,0.3189,0.414,0.4825,0.4723,0.4538,0.4824,0.432,0.4291,0.4509,0.4473,0.4355,0.4352,0.417,0.3501,0.3604,0.3522,0.3522,0.3312,0.2872,0.3007,0.3107,0.3012,0.3508,0.3513,0.2808,0.2696,0.2754,0.2798,0.3003,0.3016,0.3206,0.3333,0.3338,0.3863,0.3848,0.3738,0.3765,0.4431,0.4722,0.4426,0.5017,0.4252,0.4531,0.3702,0.3076,0.3477,0.4059,0.3797,0.3439,0.3262,0.3792,0.3525],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.1405,0.1358,0.1587,0.09310000000000002,0.08810000000000001,0.1177,0.128,0.14,0.1324,0.12230000000000002,0.1978,0.13540000000000002,0.16340000000000002,0.1608,0.12060000000000001,0.12350000000000003,0.15360000000000001,0.19110000000000002,0.17500000000000002,0.1918,0.20719999999999997,0.22560000000000002,0.19129999999999997,0.21330000000000002,0.1771,0.09240000000000001,0.11259999999999998,0.1018,0.13200000000000003,0.11300000000000002,0.09939999999999999,0.05379999999999999,0.09069999999999998,0.1152,0.09820000000000001,0.12160000000000001,0.19369999999999998,0.2561,0.2797,0.21409999999999998,0.26080000000000003,0.2047,0.2017,0.22940000000000002,0.21639999999999998,0.2022,0.2065,0.19929999999999998,0.12860000000000002,0.1529,0.1395,0.12760000000000002,0.10599999999999998,0.0874,0.06460000000000002,0.11519999999999997,0.08560000000000001,0.1089,0.1399,0.06420000000000001,0.048600000000000004,0.07729999999999998,0.0643,0.0731,0.0464,0.11179999999999998,0.11209999999999998,0.0797,0.17239999999999997,0.13759999999999997,0.14350000000000002,0.1557,0.2054,0.23,0.2297,0.2529,0.22550000000000003,0.2452,0.15259999999999999,0.091,0.1376,0.1866,0.1793,0.14429999999999998,0.1245,0.13499999999999998,0.13029999999999997],"cost_curve":[0.0341075,0.008671,0.009933,0.011693,0.011359,0.011744,0.012969,0.0128505,0.0136975,0.0145995,0.015301,0.0155175,0.016027,0.016946,0.017895,0.018386,0.018582,0.018788,0.0187295,0.019166,0.0193225,0.020037,0.020191,0.021713,0.0221365,0.0228075,0.023156,0.023492,0.0242735,0.024604,0.0252225,0.0251405,0.0261695,0.026583,0.0269865,0.0274075,0.027816,0.028227,0.0286455,0.0291915,0.0294545,0.029723,0.0305645,0.0308175,0.0328635,0.033461,0.034671,0.0367885,0.037283,0.0373625,0.038158,0.0386505,0.0390155,0.0399355,0.042003,0.044387,0.045215,0.0462315,0.047415,0.048364,0.0486195,0.0484975,0.0497415,0.050037,0.050893,0.051417,0.0520405,0.052346,0.0525445,0.0533585,0.0538895,0.054023,0.0543835,0.053722,0.052576,0.051178,0.05012,0.0505205,0.051071,0.052419,0.052799,0.053477,0.053852,0.053142,0.053973,0.05385,0.054998,0.0547005,0.0550515,0.0561395]},{"run_name":"codex-gpt-5.4","task":"codebase_adaptation","run_index":0,"reward":7.45,"baseline_reward":8.25,"reference_reward":19.0,"gain":-0.7999999999999998,"normalized_reward":-0.2094240837696334,"normalized_gain":-0.07441860465116278,"cost_usd":3.7940515,"latency_seconds":6.205719,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.0,0.875,0.0,0.925,0.0,0.675,0.625,0.0,0.875,0.9,0.85,0.0,0.825,0.9,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.5,0.775,0.625,0.625,0.0,0.6,0.75,0.825,0.675,0.775,0.725,0.0,0.0,0.6,0.0,0.775],"gain_curve":[0.0,0.0,0.0,-0.5,0.09999999999999998,-0.625,0.30000000000000004,0.0,0.07500000000000007,-0.125,-0.825,0.19999999999999996,0.125,0.125,0.0,0.825,0.30000000000000004,0.0,-0.775],"cost_curve":[0.13536,0.079286,0.146514,0.070613,0.093658,0.1935425,0.0856105,0.089876,0.313545,0.449947,0.145734,0.2017805,0.165446,0.255624,0.3597635,0.351231,0.196626,0.203988,0.2559065]},{"run_name":"codex-gpt-5.4","task":"cohort_studies","run_index":0,"reward":0.8206,"baseline_reward":0.5048999999999999,"reference_reward":3.24404,"gain":0.3157000000000001,"normalized_reward":-0.07730469340398478,"normalized_gain":0.11525515307724327,"cost_usd":7.763108,"latency_seconds":13.096974,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.181,0.0,0.0656,0.0,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.2283,0.1218,0.0,0.1439],"baseline_reward_curve":[0.0,0.0,0.0663,0.1177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0285,0.0,0.0,0.0,0.0769,0.1142,0.002,0.0993],"gain_curve":[0.0,0.0,-0.0663,0.0633,0.0,0.0656,0.0,0.0,0.0,0.08,0.0,0.0,-0.0285,0.0,0.0,0.0,0.1514,0.0076000000000000095,-0.002,0.0446],"cost_curve":[0.1875045,0.210752,0.235293,0.2424795,0.275083,0.3061445,0.291697,0.316768,0.396725,0.366841,0.7871885,0.337495,0.424973,0.3916285,0.7341155,0.34988,0.4363215,0.7608835,0.3433685,0.3679665]},{"run_name":"codex-gpt-5.4","task":"database_exploration","run_index":0,"reward":9.600000000000001,"baseline_reward":3.466666666666667,"reference_reward":40.0,"gain":6.133333333333335,"normalized_reward":0.11798839458413933,"normalized_gain":0.16788321167883216,"cost_usd":1.8486595,"latency_seconds":4.744029,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.8,0.0,0.0,0.8666666666666667,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.8666666666666667,0.0,0.9333333333333333,0.0,0.0,0.0,0.8,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.2666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.4666666666666667,0.0,0.1333333333333333,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.4],"gain_curve":[0.0,0.0,0.0,-0.2666666666666667,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.33333333333333337,0.0,-0.1333333333333333,0.8666666666666667,0.0,-0.6,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.2666666666666667,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.8666666666666667,0.0,0.9333333333333333,0.0,0.0,0.0,0.20000000000000007,0.5333333333333333],"cost_curve":[0.0838375,0.024591,0.0178905,0.0356565,0.0189455,0.0262025,0.0341625,0.0305815,0.0325005,0.0325915,0.045647,0.02525,0.114594,0.0378895,0.026054,0.027927,0.0287075,0.027313,0.028857,0.0593065,0.046621,0.063631,0.033055,0.083896,0.0346445,0.0343355,0.035739,0.0348855,0.1886125,0.039996,0.039614,0.0415425,0.061278,0.041378,0.0429345,0.042983,0.0437915,0.0436875,0.090752,0.0467775]},{"run_name":"codex-gpt-5.4","task":"exploitable_poker","run_index":0,"reward":85.0,"baseline_reward":64.5,"reference_reward":1138.5,"gain":20.5,"normalized_reward":-0.04836302119613892,"normalized_gain":0.019087523277467412,"cost_usd":8.271512,"latency_seconds":5.879567,"instance_count":120,"reward_curve":[-2.0,4.0,-3.0,20.0,-8.0,2.0,5.0,-8.0,20.0,-20.0,2.0,-7.0,4.0,5.0,-5.0,-8.0,8.0,-3.0,7.0,20.0,2.0,-3.0,-1.0,1.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,2.0,0.5,1.0,-3.0,0.5,0.5,-2.0,2.0,1.0,10.0,-1.0,-2.0,-0.5,1.0,2.0,-7.0,10.0,-1.0,-0.5,2.0,5.0,4.0,-8.0,-7.0,4.0,16.0,4.0,-2.0,-5.0,-3.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-8.0,2.0,8.0,0.5,-7.0,-12.0,3.0,-2.0,0.5,-3.0,1.0,1.0,0.5,16.0,3.0,3.0,-4.0,1.0,3.0,1.0,1.0,0.5,1.0,-7.0,-3.0,0.5,0.5,3.0,-1.0,4.0,3.0,0.5,-1.0,-2.0,1.0,1.0,-1.0,0.5,-15.0,2.0,1.0,2.0,1.0,2.0,4.0,-1.0,2.0,18.0,3.0,0.5,-4.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,9.0,-3.0,1.0,4.0,-5.0,14.5,-10.5,4.0,-9.0,10.5,7.0,-2.0,-10.0,3.0,-5.0,13.0,8.5,2.0,-5.0,-0.5,2.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,2.0,0.5,2.0,-4.0,0.5,0.5,-1.0,0.0,1.0,12.0,-1.0,-1.0,-1.0,-0.5,2.0,-8.0,2.0,-1.0,-0.5,2.0,2.0,2.0,-14.0,-7.0,4.0,8.0,2.0,-2.0,-2.0,-2.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,2.0,1.0,-6.0,-0.5,18.0,0.5,-4.0,-4.0,1.0,-0.5,0.5,-3.0,4.0,-10.0,0.5,4.0,1.0,3.0,-2.0,1.0,18.0,1.0,-0.5,0.5,1.0,-8.0,-2.0,0.5,0.5,1.0,-1.0,2.0,1.0,0.5,-1.0,-3.0,1.0,-1.0,2.0,0.5,-11.0,2.0,2.0,2.0,2.0,-1.0,6.0,-1.0,2.0,18.0,2.0,0.5,-5.0],"gain_curve":[-1.0,4.5,-2.0,11.0,-5.0,1.0,1.0,-3.0,5.5,-9.5,-2.0,2.0,-6.5,-2.0,-3.0,2.0,5.0,2.0,-6.0,11.5,0.0,2.0,-0.5,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,-1.0,2.0,0.0,-2.0,0.0,-1.0,0.5,1.5,0.0,1.0,8.0,0.0,0.0,0.0,3.0,2.0,6.0,0.0,0.0,8.0,2.0,0.0,-3.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-2.0,2.5,-10.0,0.0,-3.0,-8.0,2.0,-1.5,0.0,0.0,-3.0,11.0,0.0,12.0,2.0,0.0,-2.0,0.0,-15.0,0.0,1.5,0.0,0.0,1.0,-1.0,0.0,0.0,2.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,2.0,-3.0,0.0,-4.0,0.0,-1.0,0.0,-1.0,3.0,-2.0,0.0,0.0,0.0,1.0,0.0,1.0],"cost_curve":[0.049678,0.025137,0.027508,0.0298965,0.03245,0.034411,0.0361945,0.0385955,0.040859,0.0431725,0.045661,0.0477765,0.0489635,0.0508595,0.053774,0.056022,0.0577875,0.059921,0.062022,0.064248,0.049823,0.085042,0.0347795,0.035759,0.0358675,0.018326,0.0,0.018681,0.0,0.0374045,0.056865,0.0,0.0197155,0.0782455,0.0,0.0,0.060554,0.0611295,0.0417565,0.063511,0.0643475,0.065524,0.022069,0.044929,0.091325,0.1165205,0.071635,0.0484885,0.024391,0.073895,0.1012405,0.103247,0.104813,0.107247,0.109469,0.1120155,0.1138665,0.1154125,0.118107,0.120374,0.0611855,0.0,0.0,0.123259,0.062597,0.031624,0.063151,0.063585,0.0324485,0.129829,0.1321935,0.1340845,0.0,0.136593,0.1387965,0.070248,0.141371,0.0,0.144155,0.036548,0.0365105,0.0,0.147182,0.0747725,0.0748435,0.1515145,0.0382555,0.076631,0.077403,0.078022,0.0,0.078636,0.158754,0.160803,0.0,0.0,0.0816685,0.081622,0.123572,0.0831915,0.0,0.0834375,0.126328,0.0848115,0.042783,0.042748,0.0,0.216005,0.131252,0.044177,0.0886815,0.0888955,0.1343065,0.1357155,0.0910705,0.091967,0.1848615,0.093345,0.0,0.234764]},{"run_name":"codex-gpt-5.4","task":"sales_prediction","run_index":0,"reward":8.445400000000001,"baseline_reward":5.176100000000001,"reference_reward":12.0,"gain":3.2693000000000003,"normalized_reward":0.4448279632030239,"normalized_gain":0.4790955318805962,"cost_usd":2.648841,"latency_seconds":10.882319,"instance_count":12,"reward_curve":[0.3613,0.3723,0.3921,0.8025,0.774,0.8022,0.8059,0.8468,0.7975,0.8205,0.8364,0.8339],"baseline_reward_curve":[0.4398,0.3375,0.5167,0.5975,0.4706,0.4551,0.5788,0.4435,0.4208,0.1523,0.445,0.3185],"gain_curve":[-0.07850000000000001,0.0348,-0.12460000000000004,0.20499999999999996,0.3034,0.3471,0.22709999999999997,0.4033,0.3767,0.6682,0.3914,0.5154],"cost_curve":[0.1603165,0.180314,0.1881175,0.2221615,0.2254155,0.186689,0.199926,0.213228,0.2266025,0.3125595,0.2603605,0.2731505]},{"run_name":"codex-gpt-5.4","task":"sales_prediction","run_index":1,"reward":7.6736,"baseline_reward":5.176100000000001,"reference_reward":12.0,"gain":2.4974999999999996,"normalized_reward":0.32428506723725936,"normalized_gain":0.365993053825525,"cost_usd":1.5065575,"latency_seconds":11.989817,"instance_count":12,"reward_curve":[0.0,0.5958,0.6075,0.6314,0.6698,0.6977,0.7206,0.743,0.7224,0.7403,0.7745,0.7706],"baseline_reward_curve":[0.4398,0.3375,0.5167,0.5975,0.4706,0.4551,0.5788,0.4435,0.4208,0.1523,0.445,0.3185],"gain_curve":[-0.4398,0.2583,0.09079999999999999,0.03389999999999993,0.19919999999999993,0.24259999999999998,0.14180000000000004,0.2995,0.30160000000000003,0.588,0.32949999999999996,0.45209999999999995],"cost_curve":[0.1590135,0.1463655,0.110339,0.1384895,0.096624,0.103345,0.110338,0.116518,0.1217285,0.1285525,0.134385,0.140859]},{"run_name":"codex-gpt-5.4","task":"sales_prediction","run_index":2,"reward":8.1118,"baseline_reward":5.176100000000001,"reference_reward":12.0,"gain":2.9356999999999998,"normalized_reward":0.39272494416418086,"normalized_gain":0.43020853177801555,"cost_usd":1.7339165,"latency_seconds":11.498106,"instance_count":12,"reward_curve":[0.1464,0.6132,0.7654,0.7971,0.7678,0.7524,0.7357,0.7256,0.7031,0.6976,0.703,0.7045],"baseline_reward_curve":[0.4398,0.3375,0.5167,0.5975,0.4706,0.4551,0.5788,0.4435,0.4208,0.1523,0.445,0.3185],"gain_curve":[-0.2934,0.27569999999999995,0.24869999999999992,0.1996,0.2972,0.29729999999999995,0.15690000000000004,0.2821,0.28229999999999994,0.5453,0.25799999999999995,0.386],"cost_curve":[0.191511,0.1287735,0.147078,0.1217925,0.1331215,0.166304,0.115434,0.120142,0.1251495,0.1982815,0.1404445,0.1458845]},{"run_name":"codex-gpt-5.4","task":"sales_prediction","run_index":3,"reward":8.8275,"baseline_reward":5.176100000000001,"reference_reward":12.0,"gain":3.6513999999999998,"normalized_reward":0.5045059115685572,"normalized_gain":0.5350899046000088,"cost_usd":2.73783,"latency_seconds":10.28029,"instance_count":12,"reward_curve":[0.3855,0.5954,0.8276,0.7483,0.7595,0.7587,0.7861,0.8197,0.7946,0.7616,0.8077,0.7828],"baseline_reward_curve":[0.4398,0.3375,0.5167,0.5975,0.4706,0.4551,0.5788,0.4435,0.4208,0.1523,0.445,0.3185],"gain_curve":[-0.054300000000000015,0.2579,0.31089999999999995,0.15079999999999993,0.28889999999999993,0.30360000000000004,0.20730000000000004,0.3762,0.37379999999999997,0.6093000000000001,0.36269999999999997,0.46430000000000005],"cost_curve":[0.173304,0.150125,0.208705,0.2207725,0.212735,0.204476,0.188297,0.2672835,0.219371,0.231754,0.3191075,0.3418995]},{"run_name":"codex-gpt-5.4","task":"sales_prediction","run_index":4,"reward":8.5414,"baseline_reward":5.176100000000001,"reference_reward":12.0,"gain":3.3652999999999986,"normalized_reward":0.4598216377465756,"normalized_gain":0.4931637333489645,"cost_usd":3.2891855,"latency_seconds":10.490402,"instance_count":12,"reward_curve":[0.6283,0.6079,0.6942,0.7089,0.7234,0.7269,0.7282,0.7574,0.7317,0.7364,0.7567,0.7414],"baseline_reward_curve":[0.4398,0.3375,0.5167,0.5975,0.4706,0.4551,0.5788,0.4435,0.4208,0.1523,0.445,0.3185],"gain_curve":[0.18849999999999995,0.2704,0.1775,0.11139999999999994,0.2528,0.2718,0.14939999999999998,0.31389999999999996,0.3109,0.5841000000000001,0.31170000000000003,0.42289999999999994],"cost_curve":[0.248272,0.1817225,0.197684,0.203032,0.1996905,0.2201805,0.240437,0.2616955,0.3556805,0.389212,0.341186,0.450393]},{"run_name":"icl-claude-opus-4.7","task":"blind_spectrum_monitoring","run_index":0,"reward":35.1646,"baseline_reward":19.7597,"reference_reward":90.0,"gain":15.404900000000001,"normalized_reward":0.2193026666097182,"normalized_gain":0.21931711567291143,"cost_usd":8.73162775,"latency_seconds":16.064317,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3552,0.3601,0.3641,0.3454,0.3364,0.3458,0.3456,0.3693,0.3706,0.3639,0.4706,0.3481,0.4069,0.4081,0.4136,0.4236,0.5305,0.5045,0.4699,0.4699,0.4699,0.5072,0.5072,0.4835,0.4541,0.4544,0.4199,0.3726,0.3726,0.3387,0.3394,0.3273,0.3653,0.3758,0.4047,0.3848,0.3876,0.4266,0.3992,0.4143,0.3926,0.3714,0.4467,0.4699,0.5028,0.4728,0.465,0.4671,0.4231,0.4266,0.4205,0.4268,0.4235,0.4103,0.4089,0.3939,0.389,0.4139,0.4111,0.3614,0.3547,0.3519,0.3299,0.3201,0.3378,0.3363,0.3305,0.3213,0.3942,0.4058,0.4264,0.403,0.3952,0.4374,0.435,0.443,0.442,0.4234,0.3259,0.2722,0.2653,0.304,0.2986,0.2833,0.2874,0.3524,0.3275],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.12880000000000003,0.13599999999999998,0.1513,0.11809999999999998,0.14139999999999997,0.1237,0.133,0.12890000000000001,0.14209999999999998,0.1446,0.22230000000000003,0.15610000000000002,0.2095,0.18420000000000003,0.18660000000000002,0.2171,0.28309999999999996,0.30269999999999997,0.268,0.2569,0.26159999999999994,0.2828,0.2739,0.2741,0.2436,0.22320000000000004,0.2127,0.1744,0.1641,0.1292,0.1367,0.10379999999999998,0.1514,0.17290000000000003,0.1633,0.18749999999999997,0.1673,0.2002,0.2066,0.1746,0.171,0.1441,0.2193,0.24839999999999998,0.27190000000000003,0.2395,0.23630000000000004,0.2494,0.20159999999999997,0.2191,0.20779999999999998,0.20220000000000002,0.19829999999999998,0.2105,0.17279999999999998,0.19839999999999997,0.1734,0.172,0.19970000000000002,0.1448,0.1337,0.1538,0.11440000000000003,0.09289999999999998,0.0826,0.12749999999999997,0.10930000000000001,0.06719999999999998,0.1807,0.1586,0.1961,0.18220000000000003,0.1575,0.1952,0.2221,0.1942,0.24230000000000002,0.2155,0.10830000000000004,0.05560000000000001,0.05519999999999997,0.0847,0.09819999999999998,0.0837,0.0857,0.10819999999999999,0.1053],"cost_curve":[0.010755,0.020595,0.0323875,0.03137575,0.03306675,0.0318295,0.0343985,0.03571225,0.0369415,0.0399025,0.04207175,0.04293325,0.04344425,0.04608175,0.050201,0.05313625,0.0541875,0.0543335,0.05496725,0.05590125,0.0566475,0.0574365,0.05854775,0.061467,0.06420075,0.0653185,0.06696675,0.0683065,0.06972775,0.072673,0.07662025,0.07722575,0.0784505,0.0792435,0.08011475,0.08124875,0.08341875,0.08483775,0.0860165,0.08749525,0.08907225,0.08953725,0.090537,0.0925795,0.09446125,0.0958395,0.09664275,0.1003305,0.1020685,0.1022975,0.10255,0.1041415,0.10495,0.1061155,0.109521,0.11361375,0.114931,0.114583,0.11650475,0.118346,0.121828,0.12352475,0.124045,0.1254325,0.12525275,0.1261995,0.1279935,0.1303325,0.13395425,0.13585025,0.13824875,0.141968,0.145571,0.147609,0.1476155,0.14968725,0.15145525,0.152175,0.152517,0.1541305,0.155493,0.15688375,0.15934825,0.160299,0.16477775,0.1678565,0.1698455,0.17249125,0.17336625,0.177068]},{"run_name":"icl-claude-opus-4.7","task":"blind_spectrum_monitoring","run_index":1,"reward":28.62509999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":8.86539999999999,"normalized_reward":0.12619912014692672,"normalized_gain":0.1262152923606532,"cost_usd":7.54093475,"latency_seconds":12.574142,"instance_count":90,"reward_curve":[0.2072,0.2245,0.2454,0.2647,0.2786,0.2675,0.2719,0.3278,0.3249,0.345,0.3985,0.3892,0.3813,0.3918,0.3853,0.3835,0.3937,0.3682,0.3859,0.4258,0.3896,0.4068,0.4077,0.4036,0.3018,0.2848,0.3038,0.3064,0.2649,0.3116,0.3086,0.3147,0.3065,0.3333,0.3285,0.3529,0.3554,0.3389,0.3441,0.3188,0.3516,0.3051,0.3266,0.3223,0.3609,0.3444,0.3251,0.2884,0.3169,0.257,0.2506,0.2621,0.2882,0.3021,0.3753,0.3989,0.346,0.3149,0.3251,0.3238,0.2927,0.2922,0.272,0.2744,0.2926,0.2865,0.2963,0.2814,0.293,0.2779,0.2614,0.2989,0.3047,0.319,0.2807,0.2664,0.2841,0.2668,0.3009,0.2892,0.3928,0.4304,0.427,0.3134,0.2975,0.2517,0.275,0.2609,0.25,0.2666],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.0237,0.03370000000000001,0.0383,0.05450000000000002,0.054700000000000026,0.04459999999999997,0.13279999999999997,0.10280000000000003,0.13239999999999996,0.15810000000000002,0.16069999999999998,0.16199999999999998,0.1435,0.19329999999999997,0.18610000000000002,0.1698,0.14120000000000002,0.17940000000000003,0.1784,0.1878,0.2049,0.1947,0.1953,0.07740000000000002,0.05149999999999999,0.09440000000000001,0.09590000000000001,0.033700000000000035,0.10439999999999999,0.1104,0.10619999999999999,0.097,0.1306,0.10500000000000001,0.13899999999999998,0.1525,0.09749999999999998,0.1468,0.09849999999999998,0.12520000000000003,0.11249999999999999,0.0869,0.10069999999999998,0.1336,0.11699999999999999,0.1036,0.057499999999999996,0.08360000000000001,0.02830000000000002,0.032899999999999985,0.0406,0.08070000000000002,0.08939999999999998,0.15070000000000003,0.17369999999999997,0.14619999999999997,0.07880000000000001,0.1296,0.10819999999999996,0.05080000000000001,0.08080000000000001,0.05540000000000003,0.053399999999999975,0.09450000000000003,0.07099999999999998,0.0691,0.0262,0.08419999999999997,0.05669999999999997,0.007300000000000029,0.0854,0.05750000000000002,0.0887,0.05990000000000001,0.02870000000000003,0.04190000000000002,0.053899999999999976,0.05210000000000001,0.08950000000000002,0.18489999999999998,0.21280000000000002,0.2104,0.1033,0.07819999999999999,0.051299999999999985,0.07540000000000002,0.05920000000000003,0.0058,0.044399999999999995],"cost_curve":[0.014975,0.02544,0.03840375,0.03228575,0.03223825,0.03203725,0.0327705,0.03541175,0.03692825,0.03879525,0.04242875,0.04500775,0.045,0.0450585,0.0470055,0.0492405,0.049838,0.0499995,0.05371275,0.055818,0.057008,0.0603585,0.06188175,0.06131475,0.06363625,0.06501375,0.0659805,0.0668645,0.0672415,0.06908475,0.070537,0.07170525,0.072345,0.07299875,0.07358,0.0749535,0.07644725,0.0775085,0.07958625,0.0823385,0.083243,0.083161,0.08348675,0.084875,0.08648225,0.08634675,0.0869205,0.088462,0.089847,0.0916865,0.0925345,0.09271375,0.093382,0.0947015,0.096976,0.09780775,0.09840825,0.09985675,0.1012895,0.1021635,0.10167775,0.101859,0.10334775,0.10424075,0.10546425,0.1071805,0.10883875,0.110476,0.111233,0.11300425,0.114006,0.11345125,0.1144365,0.1170045,0.1183225,0.118157,0.11912775,0.120881,0.1222215,0.1210165,0.121671,0.123178,0.12421125,0.12524,0.12630225,0.1280875,0.12930175,0.1287635,0.12961675,0.13149275]},{"run_name":"icl-claude-opus-4.7","task":"blind_spectrum_monitoring","run_index":2,"reward":39.492000000000026,"baseline_reward":19.7597,"reference_reward":90.0,"gain":19.732300000000027,"normalized_reward":0.280912313671892,"normalized_gain":0.2809256224702916,"cost_usd":6.65906675,"latency_seconds":11.151956,"instance_count":90,"reward_curve":[0.2482,0.2781,0.3113,0.2771,0.3255,0.3061,0.3717,0.3841,0.388,0.4128,0.46,0.4537,0.462,0.4378,0.4387,0.4839,0.4732,0.4512,0.4348,0.4327,0.4193,0.4137,0.4008,0.4008,0.4678,0.3899,0.3899,0.3785,0.3946,0.428,0.445,0.475,0.4561,0.4561,0.3697,0.3665,0.4348,0.423,0.4131,0.4298,0.4298,0.3754,0.4491,0.4218,0.4218,0.4152,0.4325,0.4498,0.4178,0.4178,0.4241,0.4277,0.4743,0.4422,0.471,0.456,0.4651,0.4401,0.4457,0.4457,0.4401,0.4401,0.4277,0.4868,0.4684,0.4684,0.5177,0.5132,0.5132,0.5239,0.5239,0.4822,0.4889,0.4889,0.4751,0.4353,0.4353,0.4851,0.4851,0.4851,0.4851,0.4851,0.5065,0.5065,0.5065,0.4851,0.4993,0.5064,0.5064,0.5145],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.02990000000000001,0.09960000000000002,0.05070000000000002,0.10140000000000002,0.0933,0.14439999999999997,0.1891,0.16590000000000002,0.2002,0.21960000000000002,0.22519999999999998,0.24270000000000003,0.18950000000000003,0.24669999999999997,0.2865,0.24930000000000002,0.22419999999999998,0.22830000000000003,0.18529999999999996,0.2175,0.21180000000000002,0.1878,0.19249999999999998,0.2434,0.15660000000000002,0.18050000000000002,0.168,0.16340000000000002,0.2208,0.24680000000000002,0.26649999999999996,0.2466,0.2534,0.14619999999999997,0.15259999999999999,0.23190000000000002,0.18159999999999998,0.21580000000000002,0.20950000000000002,0.20340000000000003,0.18280000000000002,0.2094,0.20020000000000002,0.1945,0.18780000000000002,0.211,0.21889999999999998,0.1845,0.18910000000000002,0.20639999999999997,0.20620000000000002,0.26680000000000004,0.22949999999999998,0.24639999999999998,0.2308,0.2653,0.204,0.2502,0.23009999999999997,0.1982,0.2287,0.21110000000000004,0.26580000000000004,0.2703,0.2529,0.29050000000000004,0.258,0.3044,0.3027,0.26980000000000004,0.26870000000000005,0.2417,0.2586,0.2543,0.19760000000000003,0.19310000000000002,0.2722,0.23629999999999998,0.2854,0.2772,0.26749999999999996,0.28989999999999994,0.29639999999999994,0.28719999999999996,0.28469999999999995,0.2997,0.30469999999999997,0.2622,0.29229999999999995],"cost_curve":[0.01216,0.02015,0.03325625,0.029024,0.03381375,0.03517475,0.03626175,0.04026075,0.041854,0.04232725,0.04227375,0.04205725,0.042556,0.04484925,0.0484895,0.05244575,0.0536645,0.05341575,0.054642,0.05751325,0.058471,0.05935025,0.06005575,0.060587,0.05533225,0.0571565,0.05767075,0.05691,0.05861875,0.0598715,0.060914,0.061051,0.062126,0.0645585,0.064293,0.0637255,0.06462225,0.06623175,0.06806375,0.06899125,0.069169,0.06926175,0.07050225,0.07250775,0.073248,0.0745505,0.07410475,0.074558,0.075571,0.0755635,0.077128,0.07855275,0.0804885,0.0820745,0.08139075,0.08221625,0.0839165,0.0846015,0.085374,0.0864135,0.08682975,0.0882155,0.0897355,0.0911855,0.0907355,0.09097975,0.09382975,0.095014,0.0959025,0.096861,0.0989105,0.1001485,0.09983775,0.1007445,0.10237475,0.10348475,0.1036145,0.10450525,0.1061325,0.107297,0.1070585,0.10790575,0.10883575,0.10934,0.1106995,0.1120565,0.11293225,0.11418725,0.11594625,0.11778325]},{"run_name":"icl-claude-opus-4.7","task":"blind_spectrum_monitoring","run_index":3,"reward":40.182799999999986,"baseline_reward":19.7597,"reference_reward":90.0,"gain":20.423099999999987,"normalized_reward":0.2907473056279272,"normalized_gain":0.29076043240134203,"cost_usd":6.64583025,"latency_seconds":11.028103,"instance_count":90,"reward_curve":[0.192,0.2393,0.2286,0.2412,0.2377,0.2377,0.2873,0.3403,0.346,0.3457,0.3513,0.3489,0.3474,0.3474,0.3474,0.3474,0.3523,0.3596,0.3552,0.346,0.349,0.3627,0.3719,0.3793,0.3689,0.3689,0.4426,0.4765,0.4824,0.4824,0.4831,0.4831,0.5251,0.5251,0.5122,0.5059,0.5059,0.5133,0.5234,0.5033,0.468,0.5052,0.5052,0.5052,0.5107,0.5007,0.5007,0.5007,0.5007,0.4718,0.4779,0.4779,0.504,0.504,0.504,0.504,0.504,0.504,0.504,0.504,0.504,0.504,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948,0.4948],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.008899999999999991,0.0169,0.014800000000000008,0.013600000000000001,0.024900000000000005,0.06,0.14529999999999998,0.12389999999999998,0.1331,0.1109,0.12039999999999998,0.1281,0.0991,0.15539999999999998,0.15,0.12840000000000001,0.13259999999999997,0.14870000000000003,0.09859999999999997,0.14719999999999997,0.16080000000000003,0.1589,0.171,0.14450000000000002,0.1356,0.2332,0.266,0.2512,0.2752,0.2849,0.27459999999999996,0.3156,0.3224,0.28869999999999996,0.29200000000000004,0.30300000000000005,0.2719,0.32609999999999995,0.283,0.24160000000000004,0.3126,0.26549999999999996,0.28359999999999996,0.28340000000000004,0.27330000000000004,0.2792,0.26980000000000004,0.2674,0.2431,0.2602,0.25639999999999996,0.2965,0.2913,0.2794,0.2788,0.3042,0.2679,0.3085,0.2884,0.2621,0.29259999999999997,0.2782,0.27380000000000004,0.2967,0.2793,0.2676,0.23960000000000004,0.28600000000000003,0.2736,0.24070000000000003,0.2813,0.24760000000000001,0.2645,0.274,0.2571,0.25260000000000005,0.28190000000000004,0.24600000000000002,0.29510000000000003,0.28690000000000004,0.2772,0.2782,0.2847,0.2755,0.2944,0.2952,0.2931,0.25060000000000004,0.2726],"cost_curve":[0.013545,0.023805,0.036415,0.0292075,0.029707,0.03054225,0.0320115,0.03443175,0.0349315,0.03620675,0.036338,0.03704,0.038687,0.03914175,0.04028075,0.041613,0.04164175,0.04381775,0.0486435,0.04914025,0.048591,0.05130725,0.05212325,0.05402075,0.05557125,0.05653825,0.05823675,0.06011175,0.0607915,0.060942,0.061851,0.06366325,0.064391,0.0651715,0.067281,0.0676805,0.067445,0.0687145,0.070057,0.07073975,0.0702335,0.07165125,0.0742165,0.0749885,0.0750955,0.07615625,0.076291,0.07680675,0.07770175,0.0782445,0.0791095,0.08082725,0.08181575,0.08305325,0.08377225,0.083393,0.084676,0.08638975,0.08703725,0.0874695,0.0891705,0.0905245,0.09148375,0.092441,0.09266525,0.09308375,0.0948395,0.096945,0.09809075,0.098936,0.0999665,0.100099,0.10222825,0.1034415,0.10316175,0.1041455,0.1057,0.1064615,0.10663725,0.108037,0.10937775,0.1104205,0.1108905,0.112148,0.11318375,0.11447925,0.11562325,0.1157515,0.11631975,0.1182735]},{"run_name":"icl-claude-opus-4.7","task":"blind_spectrum_monitoring","run_index":4,"reward":24.396799999999995,"baseline_reward":19.7597,"reference_reward":90.0,"gain":4.637099999999997,"normalized_reward":0.0660003701647232,"normalized_gain":0.06601765653051021,"cost_usd":8.303322,"latency_seconds":14.524732,"instance_count":90,"reward_curve":[0.2273,0.2437,0.3124,0.3354,0.2749,0.34,0.3393,0.3393,0.3235,0.3737,0.3686,0.3466,0.36,0.36,0.3411,0.3136,0.29,0.2684,0.2554,0.2889,0.3029,0.3071,0.292,0.292,0.2836,0.2836,0.2941,0.2941,0.2941,0.2902,0.3027,0.2325,0.215,0.2335,0.2275,0.2576,0.2576,0.3028,0.3039,0.3473,0.3473,0.3193,0.2884,0.2884,0.3216,0.3114,0.3042,0.2869,0.3013,0.2927,0.264,0.2559,0.2559,0.246,0.2347,0.2373,0.2517,0.2524,0.2524,0.2585,0.2295,0.2456,0.2935,0.298,0.2953,0.23,0.2242,0.2242,0.2125,0.176,0.1676,0.1638,0.1941,0.1759,0.2229,0.287,0.2212,0.2519,0.2367,0.2288,0.2076,0.2143,0.253,0.2252,0.2271,0.2389,0.2389,0.2368,0.25,0.2398],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,-0.004500000000000004,0.10070000000000001,0.10899999999999999,0.050799999999999984,0.12720000000000004,0.11199999999999999,0.14429999999999998,0.10140000000000002,0.16109999999999997,0.12819999999999998,0.11810000000000001,0.1407,0.1117,0.1491,0.1162,0.06609999999999999,0.04140000000000002,0.04890000000000003,0.04149999999999998,0.1011,0.10519999999999999,0.07899999999999999,0.08369999999999997,0.05920000000000003,0.05030000000000001,0.08469999999999997,0.08359999999999998,0.06289999999999998,0.08300000000000002,0.10450000000000004,0.02400000000000002,0.005500000000000005,0.030800000000000022,0.0040000000000000036,0.04369999999999999,0.0547,0.06140000000000001,0.1066,0.127,0.12090000000000001,0.12669999999999998,0.04869999999999999,0.0668,0.0943,0.08400000000000002,0.08270000000000002,0.055999999999999994,0.068,0.06400000000000003,0.04630000000000001,0.034400000000000014,0.048400000000000026,0.033299999999999996,0.010099999999999998,0.0121,0.051899999999999974,0.01630000000000001,0.056900000000000006,0.042899999999999994,-0.012399999999999994,0.03420000000000001,0.0769,0.07699999999999999,0.09720000000000001,0.014500000000000013,-0.0030000000000000027,-0.030999999999999972,0.003699999999999981,-0.04520000000000002,-0.0865,-0.049699999999999994,-0.05310000000000001,-0.054400000000000004,0.0020999999999999908,0.04929999999999998,-0.02099999999999999,0.03900000000000001,-0.0121,0.029100000000000015,-0.0002999999999999947,-0.0032999999999999974,0.036400000000000016,0.015100000000000002,0.007800000000000001,0.038500000000000006,0.0393,0.03510000000000002,0.0058,0.017600000000000005],"cost_curve":[0.013635,0.021835,0.0364025,0.034581,0.03870925,0.03995275,0.04178725,0.04233075,0.0441645,0.0480025,0.0491155,0.04916775,0.05053425,0.05080725,0.051646,0.05524025,0.05634875,0.05648375,0.057938,0.06039475,0.061386,0.061878,0.062662,0.062814,0.064451,0.06594225,0.06693575,0.068059,0.06902225,0.07085875,0.07307,0.07393875,0.0736505,0.07440375,0.07560975,0.0776575,0.0788865,0.08087325,0.0823605,0.08291075,0.08368175,0.0867565,0.089939,0.0899715,0.0915315,0.0930525,0.09429725,0.09468325,0.095756,0.09693025,0.09726025,0.101077,0.1029295,0.1032365,0.105135,0.1063395,0.108181,0.109956,0.11001425,0.11034575,0.111708,0.1137405,0.11458825,0.1160915,0.1167855,0.1182665,0.120421,0.12145025,0.1217245,0.12332475,0.123787,0.12477575,0.12849875,0.130131,0.13198375,0.13352025,0.1344175,0.1384935,0.14110725,0.143855,0.144835,0.144024,0.144804,0.14610025,0.147408,0.14866475,0.150647,0.15316025,0.1583185,0.1591685]},{"run_name":"icl-claude-opus-4.7","task":"codebase_adaptation","run_index":0,"reward":11.2,"baseline_reward":8.875000000000002,"reference_reward":19.0,"gain":2.3249999999999975,"normalized_reward":0.18324607329842932,"normalized_gain":0.22962962962962943,"cost_usd":8.40077725,"latency_seconds":5.098843,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.0,0.825,0.625,0.925,0.0,0.85,0.675,0.925,0.775,0.875,0.9,0.625,0.875,0.85,0.625,0.85],"baseline_reward_curve":[0.0,0.0,0.0,0.7,0.6,0.425,0.65,0.0,0.6,0.35,0.9,0.625,0.925,0.775,0.275,0.375,0.725,0.15,0.8],"gain_curve":[0.0,0.0,0.0,-0.7,0.22499999999999998,0.2,0.275,0.0,0.25,0.32500000000000007,0.025000000000000022,0.15000000000000002,-0.050000000000000044,0.125,0.35,0.5,0.125,0.475,0.04999999999999993],"cost_curve":[0.16224075,0.1483,0.11806925,0.1616575,0.16842575,0.57400675,0.168019,0.26495575,0.28910875,0.752256,0.2067505,0.58115775,0.318158,0.32942175,1.18457425,0.59075275,0.51967625,1.269436,0.5938105]},{"run_name":"icl-claude-opus-4.7","task":"codebase_adaptation","run_index":1,"reward":9.55,"baseline_reward":8.875000000000002,"reference_reward":19.0,"gain":0.6749999999999989,"normalized_reward":0.010471204188481823,"normalized_gain":0.06666666666666657,"cost_usd":8.670886,"latency_seconds":4.82906,"instance_count":19,"reward_curve":[0.7,0.55,0.675,0.0,0.0,0.0,0.775,0.8,0.0,0.0,0.95,0.0,0.775,0.0,0.95,0.925,0.7,0.9,0.85],"baseline_reward_curve":[0.0,0.0,0.0,0.7,0.6,0.425,0.65,0.0,0.6,0.35,0.9,0.625,0.925,0.775,0.275,0.375,0.725,0.15,0.8],"gain_curve":[0.7,0.55,0.675,-0.7,-0.6,-0.425,0.125,0.8,-0.6,-0.35,0.04999999999999993,-0.625,-0.15000000000000002,-0.775,0.6749999999999999,0.55,-0.025000000000000022,0.75,0.04999999999999993],"cost_curve":[0.2251845,0.3507815,0.36126375,0.84051075,0.31763775,0.2954155,0.492749,0.4099475,0.244154,0.3666255,0.17521275,0.7953905,0.69002175,0.5462545,0.21492225,0.311927,1.0543675,0.41779975,0.56072025]},{"run_name":"icl-claude-opus-4.7","task":"codebase_adaptation","run_index":2,"reward":7.175000000000002,"baseline_reward":8.875000000000002,"reference_reward":19.0,"gain":-1.7000000000000002,"normalized_reward":-0.23821989528795787,"normalized_gain":-0.1679012345679013,"cost_usd":7.51035675,"latency_seconds":4.378976,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.925,0.0,0.0,0.55,0.925,0.775,0.875,0.825,0.75,0.65,0.9],"baseline_reward_curve":[0.0,0.0,0.0,0.7,0.6,0.425,0.65,0.0,0.6,0.35,0.9,0.625,0.925,0.775,0.275,0.375,0.725,0.15,0.8],"gain_curve":[0.0,0.0,0.0,-0.7,-0.6,-0.425,-0.65,0.0,0.32500000000000007,-0.35,-0.9,-0.07499999999999996,0.0,0.0,0.6,0.44999999999999996,0.025000000000000022,0.5,0.09999999999999998],"cost_curve":[0.10374,0.09254475,0.1261265,0.097956,0.14327075,0.113073,0.1453015,0.1219805,0.1502275,0.36511325,0.98596775,1.0449185,0.21898225,0.65345875,0.3836215,0.49980125,0.7470635,1.1346145,0.382595]},{"run_name":"icl-claude-opus-4.7","task":"codebase_adaptation","run_index":3,"reward":11.4,"baseline_reward":8.875000000000002,"reference_reward":19.0,"gain":2.5249999999999986,"normalized_reward":0.20418848167539277,"normalized_gain":0.2493827160493826,"cost_usd":12.29591975,"latency_seconds":5.167206,"instance_count":19,"reward_curve":[0.0,0.65,0.0,0.9,0.0,0.0,0.625,0.9,0.775,0.875,0.825,0.9,0.325,0.85,0.75,0.525,0.775,0.775,0.95],"baseline_reward_curve":[0.0,0.0,0.0,0.7,0.6,0.425,0.65,0.0,0.6,0.35,0.9,0.625,0.925,0.775,0.275,0.375,0.725,0.15,0.8],"gain_curve":[0.0,0.65,0.0,0.20000000000000007,-0.6,-0.425,-0.025000000000000022,0.9,0.17500000000000004,0.525,-0.07500000000000007,0.275,-0.6000000000000001,0.07499999999999996,0.475,0.15000000000000002,0.050000000000000044,0.625,0.1499999999999999],"cost_curve":[0.30331925,0.31931625,0.2428685,0.219414,0.38514725,0.81151025,0.68140525,0.27000225,0.4864715,0.341882,0.519724,0.320779,1.88469175,0.63000525,0.929451,1.72999075,0.969663,0.9458025,0.304476]},{"run_name":"icl-claude-opus-4.7","task":"codebase_adaptation","run_index":4,"reward":12.474999999999996,"baseline_reward":8.875000000000002,"reference_reward":19.0,"gain":3.5999999999999943,"normalized_reward":0.3167539267015703,"normalized_gain":0.35555555555555507,"cost_usd":3.289872,"latency_seconds":4.220629,"instance_count":19,"reward_curve":[0.0,0.625,0.0,0.6,0.9,0.925,0.0,0.95,0.0,0.925,0.0,0.925,0.925,0.925,0.975,0.95,0.95,0.95,0.95],"baseline_reward_curve":[0.0,0.0,0.0,0.7,0.6,0.425,0.65,0.0,0.6,0.35,0.9,0.625,0.925,0.775,0.275,0.375,0.725,0.15,0.8],"gain_curve":[0.0,0.625,0.0,-0.09999999999999998,0.30000000000000004,0.5,-0.65,0.95,-0.6,0.5750000000000001,-0.9,0.30000000000000004,0.0,0.15000000000000002,0.7,0.575,0.22499999999999998,0.7999999999999999,0.1499999999999999],"cost_curve":[0.06663225,0.23969575,0.21159575,0.454786,0.14736875,0.1148965,0.3536405,0.11133075,0.136977,0.16933375,0.08626375,0.17282975,0.16491525,0.1660925,0.1051375,0.1385535,0.15574225,0.1464895,0.147591]},{"run_name":"icl-claude-opus-4.7","task":"cohort_studies","run_index":0,"reward":-0.20026300000000002,"baseline_reward":-0.029840000000000016,"reference_reward":3.24404,"gain":-0.17042300000000002,"normalized_reward":-0.5311143611582813,"normalized_gain":-0.0520553593900815,"cost_usd":7.017181,"latency_seconds":8.294877,"instance_count":20,"reward_curve":[-0.042118,-0.002904,-0.022025,-0.022413,-0.008593,0.001443,0.007365,-0.016005,-0.01427,0.037014,-0.002352,-0.004839,-0.130643,-0.074839,-0.129888,-0.074439,0.063564,0.08287,0.075115,0.077694],"baseline_reward_curve":[-0.029102,0.002496,-0.025936,0.003304,-0.012534,-0.002029,0.029004,-0.011323,-0.039821,0.000789,-0.013915,-0.009497,-0.029088,0.001071,-0.018565,0.008568,0.048265,0.021601,0.045961,0.000911],"gain_curve":[-0.013016000000000003,-0.0054,0.003911000000000001,-0.025717,0.003941,0.0034720000000000003,-0.021639,-0.004681999999999999,0.025551000000000004,0.036225,0.011563,0.004658,-0.101555,-0.07591,-0.111323,-0.083007,0.015298999999999993,0.061269000000000004,0.029154,0.076783],"cost_curve":[0.4296275,0.2530145,0.23911925,0.24910275,0.56198075,0.3654685,0.32142775,0.2799095,0.5347145,0.3037785,0.31423925,0.33726525,0.52800375,0.34925775,0.2610315,0.263866,0.50715075,0.3578775,0.278633,0.28171275]},{"run_name":"icl-claude-opus-4.7","task":"cohort_studies","run_index":1,"reward":0.035276000000000016,"baseline_reward":-0.029840000000000016,"reference_reward":3.24404,"gain":0.06511600000000003,"normalized_reward":-0.4264089547196315,"normalized_gain":0.019889550014050617,"cost_usd":6.97919425,"latency_seconds":8.020582,"instance_count":20,"reward_curve":[0.001875,0.000952,0.00808,-0.027008,0.019343,-0.011957,0.00971,-0.005186,-0.017005,0.025423,-0.034973,-0.022324,0.00583,-0.013267,0.043335,-0.02114,0.106462,0.045666,-0.003936,-0.074604],"baseline_reward_curve":[-0.029102,0.002496,-0.025936,0.003304,-0.012534,-0.002029,0.029004,-0.011323,-0.039821,0.000789,-0.013915,-0.009497,-0.029088,0.001071,-0.018565,0.008568,0.048265,0.021601,0.045961,0.000911],"gain_curve":[0.030976999999999998,-0.0015439999999999998,0.034016000000000005,-0.030312000000000002,0.031877,-0.009928000000000001,-0.019294,0.006137,0.022816000000000003,0.024634,-0.021057999999999997,-0.012827,0.034918,-0.014338,0.0619,-0.029708,0.058197,0.024065,-0.049897000000000004,-0.075515],"cost_curve":[0.33860375,0.277585,0.23645225,0.214874,0.52003375,0.25249725,0.25810775,0.27697825,0.4956595,0.29651625,0.30227125,0.31088725,0.5120925,0.325161,0.33472125,0.34125575,0.603585,0.35347225,0.3607115,0.36772875]},{"run_name":"icl-claude-opus-4.7","task":"cohort_studies","run_index":2,"reward":-0.04251100000000002,"baseline_reward":-0.029840000000000016,"reference_reward":3.24404,"gain":-0.012671000000000005,"normalized_reward":-0.46098802421828455,"normalized_gain":-0.0038703312277786617,"cost_usd":6.43899925,"latency_seconds":7.102087,"instance_count":20,"reward_curve":[0.015344,-0.019651,-0.005854,-0.000662,0.01291,-0.001199,-0.008474,0.011487,0.017697,-0.008546,-0.011109,-0.013956,0.019122,-0.003784,-0.03586,-0.057036,0.006798,0.01669,0.010412,0.01316],"baseline_reward_curve":[-0.029102,0.002496,-0.025936,0.003304,-0.012534,-0.002029,0.029004,-0.011323,-0.039821,0.000789,-0.013915,-0.009497,-0.029088,0.001071,-0.018565,0.008568,0.048265,0.021601,0.045961,0.000911],"gain_curve":[0.044446,-0.022147,0.020082000000000003,-0.003966,0.025444,0.00083,-0.037478,0.02281,0.057518,-0.009335,0.0028059999999999995,-0.004458999999999999,0.04821,-0.004855,-0.017295,-0.065604,-0.041467000000000004,-0.004910999999999999,-0.035549000000000004,0.012249],"cost_curve":[0.37936,0.2209035,0.20405375,0.21043825,0.564997,0.24043375,0.246946,0.25308775,0.56563625,0.27827925,0.284607,0.2910355,0.3553035,0.31278475,0.31097875,0.316761,0.390287,0.33100525,0.337814,0.344287]},{"run_name":"icl-claude-opus-4.7","task":"cohort_studies","run_index":3,"reward":0.049398000000000004,"baseline_reward":-0.029840000000000016,"reference_reward":3.24404,"gain":0.07923800000000002,"normalized_reward":-0.420131226828596,"normalized_gain":0.02420308624628881,"cost_usd":6.6749835,"latency_seconds":8.274572,"instance_count":20,"reward_curve":[-0.022022,-0.011144,-0.002154,-0.019293,-0.01157,0.010718,0.007508,0.018221,0.003516,-0.016304,0.010461,-0.011362,-0.038303,-0.013615,-0.032682,-0.005813,0.051849,0.072636,0.064807,-0.006056],"baseline_reward_curve":[-0.029102,0.002496,-0.025936,0.003304,-0.012534,-0.002029,0.029004,-0.011323,-0.039821,0.000789,-0.013915,-0.009497,-0.029088,0.001071,-0.018565,0.008568,0.048265,0.021601,0.045961,0.000911],"gain_curve":[0.0070799999999999995,-0.01364,0.023782,-0.022597000000000003,0.0009639999999999996,0.012747,-0.021495999999999998,0.029544,0.043337,-0.017093,0.024376000000000002,-0.0018650000000000003,-0.009214999999999997,-0.014686000000000001,-0.014117000000000001,-0.014380999999999998,0.003583999999999997,0.05103500000000001,0.018846,-0.006967],"cost_curve":[0.32804475,0.220637,0.23234725,0.241198,0.44713325,0.28827025,0.300247,0.28032725,0.4495055,0.2969485,0.31818675,0.3118035,0.43214625,0.37936575,0.326105,0.33290775,0.41714975,0.346121,0.3538925,0.3726465]},{"run_name":"icl-claude-opus-4.7","task":"cohort_studies","run_index":4,"reward":-0.445516,"baseline_reward":-0.029840000000000016,"reference_reward":3.24404,"gain":-0.415676,"normalized_reward":-0.6401379837655699,"normalized_gain":-0.12696739037472357,"cost_usd":7.7291605,"latency_seconds":8.69537,"instance_count":20,"reward_curve":[0.000646,0.006271,0.013191,-0.019928,-0.026214,-0.004811,-0.003214,0.006315,0.018876,-0.061743,-0.014196,-0.06027,0.002378,-0.11368,-0.072255,-0.062182,-0.00625,0.003035,-0.047772,-0.003713],"baseline_reward_curve":[-0.029102,0.002496,-0.025936,0.003304,-0.012534,-0.002029,0.029004,-0.011323,-0.039821,0.000789,-0.013915,-0.009497,-0.029088,0.001071,-0.018565,0.008568,0.048265,0.021601,0.045961,0.000911],"gain_curve":[0.029748,0.0037749999999999997,0.039127,-0.023232000000000003,-0.013680000000000001,-0.002782,-0.032218,0.017638,0.058697,-0.062532,-0.000281,-0.050773,0.031466,-0.114751,-0.05369,-0.07075000000000001,-0.054515,-0.018566,-0.09373300000000001,-0.0046240000000000005],"cost_curve":[0.33589375,0.2464235,0.235329,0.2447575,0.56711975,0.3108045,0.3226635,0.335968,0.57831025,0.3806035,0.39745525,0.40931475,0.56804675,0.351399,0.358692,0.36614625,0.55367075,0.3809815,0.38905825,0.39652275]},{"run_name":"icl-claude-opus-4.7","task":"database_exploration","run_index":0,"reward":15.333333333333336,"baseline_reward":6.066666666666666,"reference_reward":40.0,"gain":9.26666666666667,"normalized_reward":0.28433268858800786,"normalized_gain":0.27308447937131636,"cost_usd":5.9198455,"latency_seconds":4.201647,"instance_count":40,"reward_curve":[0.0,0.33333333333333337,0.0,0.6666666666666667,0.0,0.9333333333333333,0.6666666666666667,0.0,0.8,0.0,0.9333333333333333,0.33333333333333337,0.0,0.7333333333333334,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.8666666666666667,0.0,0.8666666666666667,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.7333333333333334,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.5333333333333333,0.0,0.19999999999999996,0.4666666666666667,0.0,0.1333333333333333,0.0,0.4,0.33333333333333337,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.6,0.0],"gain_curve":[0.0,0.33333333333333337,0.0,-0.06666666666666665,0.0,0.19999999999999996,0.0,-0.2666666666666667,0.4666666666666667,0.0,0.4,0.33333333333333337,-0.19999999999999996,0.2666666666666667,0.0,0.8,0.0,-0.4,0.6,0.0,0.0,0.0,0.0,0.6000000000000001,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.8666666666666667,0.0,0.8666666666666667,0.0,0.9333333333333333,0.9333333333333333,0.4,0.9333333333333333,0.33333333333333337,0.0],"cost_curve":[0.20715175,0.163341,0.078869,0.12212525,0.03917675,0.04130225,0.133421,0.1031975,0.0933125,0.071985,0.05099425,0.28828375,0.057024,0.14953375,0.53750025,0.0701725,0.33046375,0.07178275,0.0735865,0.15750075,0.1166275,0.287905,0.26564625,0.22829375,0.13606675,0.279718,0.0955165,0.09369575,0.2933395,0.10096825,0.153882,0.101476,0.154345,0.10560175,0.1097715,0.1089675,0.11118325,0.110846,0.11107275,0.11419825]},{"run_name":"icl-claude-opus-4.7","task":"database_exploration","run_index":1,"reward":12.0,"baseline_reward":6.066666666666666,"reference_reward":40.0,"gain":5.933333333333334,"normalized_reward":0.18762088974854935,"normalized_gain":0.174852652259332,"cost_usd":5.4175505,"latency_seconds":4.71617,"instance_count":40,"reward_curve":[0.5333333333333333,0.6666666666666667,0.8,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.7333333333333334,0.7333333333333334,0.8,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.8,0.8666666666666667,0.9333333333333333,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.7333333333333334,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.5333333333333333,0.0,0.19999999999999996,0.4666666666666667,0.0,0.1333333333333333,0.0,0.4,0.33333333333333337,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.6,0.0],"gain_curve":[0.5333333333333333,0.6666666666666667,0.8,-0.7333333333333334,0.0,-0.7333333333333334,-0.6666666666666667,0.5333333333333333,-0.33333333333333337,0.0,-0.5333333333333333,0.0,-0.19999999999999996,0.2666666666666667,0.7333333333333334,0.6666666666666667,0.9333333333333333,-0.4,-0.33333333333333337,0.0,0.0,0.0,0.0,-0.1333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.8,0.8666666666666667,0.9333333333333333,0.0,-0.5333333333333333,0.9333333333333333,0.33333333333333337,0.9333333333333333],"cost_curve":[0.105948,0.0794645,0.06348325,0.0678895,0.27859325,0.0459595,0.1580275,0.09534175,0.162643,0.132909,0.081638,0.0517295,0.055896,0.14637375,0.15847425,0.1258935,0.065549,0.0996755,0.10243975,0.100305,0.21638875,0.2703595,0.12602925,0.11907225,0.12585025,0.0821545,0.08062925,0.4708165,0.0969175,0.1898755,0.32781775,0.09919225,0.1955535,0.147739,0.10336925,0.1039135,0.15848825,0.10655025,0.10783675,0.11076325]},{"run_name":"icl-claude-opus-4.7","task":"database_exploration","run_index":2,"reward":21.200000000000006,"baseline_reward":6.066666666666666,"reference_reward":40.0,"gain":15.13333333333334,"normalized_reward":0.45454545454545475,"normalized_gain":0.4459724950884088,"cost_usd":4.8815765,"latency_seconds":4.793432,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.8666666666666667,0.9333333333333333,0.7333333333333334,0.9333333333333333,0.0,0.06666666666666665,0.0,0.9333333333333333,0.8,0.0,0.0,0.8666666666666667,0.4,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.8,0.4666666666666667,0.9333333333333333,0.8,0.9333333333333333,0.0,0.0,0.7333333333333334,0.9333333333333333,0.0,0.8666666666666667,0.7333333333333334,0.9333333333333333,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.7333333333333334,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.5333333333333333,0.0,0.19999999999999996,0.4666666666666667,0.0,0.1333333333333333,0.0,0.4,0.33333333333333337,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.6,0.0],"gain_curve":[0.0,0.0,0.0,0.1333333333333333,0.9333333333333333,0.0,0.2666666666666666,-0.2666666666666667,-0.2666666666666667,0.0,0.4,0.8,-0.19999999999999996,-0.4666666666666667,0.8666666666666667,0.2666666666666667,0.9333333333333333,0.5333333333333333,0.6,0.8,0.4666666666666667,0.9333333333333333,0.8,0.8,0.0,0.0,0.7333333333333334,0.9333333333333333,0.0,0.8666666666666667,0.7333333333333334,0.9333333333333333,0.9333333333333333,0.0,0.0,0.9333333333333333,-0.5333333333333333,0.9333333333333333,-0.6,0.9333333333333333],"cost_curve":[0.0767865,0.17478475,0.065205,0.05013575,0.036156,0.0981615,0.03802075,0.1095515,0.34665875,0.08420125,0.05639225,0.1159065,0.11396925,0.09431425,0.08918325,0.32375625,0.06561125,0.0657885,0.068818,0.142137,0.33325425,0.080071,0.15450975,0.08539325,0.120226,0.0805825,0.21161,0.0927295,0.18940775,0.1374515,0.2365535,0.099255,0.098454,0.0957035,0.098691,0.098291,0.09744775,0.1004835,0.15399275,0.10193075]},{"run_name":"icl-claude-opus-4.7","task":"database_exploration","run_index":3,"reward":11.866666666666669,"baseline_reward":6.066666666666666,"reference_reward":40.0,"gain":5.8000000000000025,"normalized_reward":0.18375241779497106,"normalized_gain":0.1709233791748527,"cost_usd":5.26035875,"latency_seconds":4.189432,"instance_count":40,"reward_curve":[0.0,0.8,0.0,0.0,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.0,0.0,0.8666666666666667,0.8,0.8666666666666667,0.4666666666666667,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.8,0.0,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.8666666666666667,0.9333333333333333,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.7333333333333334,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.5333333333333333,0.0,0.19999999999999996,0.4666666666666667,0.0,0.1333333333333333,0.0,0.4,0.33333333333333337,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.6,0.0],"gain_curve":[0.0,0.8,0.0,-0.7333333333333334,0.9333333333333333,0.19999999999999996,0.19999999999999996,-0.2666666666666667,-0.33333333333333337,0.8666666666666667,0.2666666666666667,0.8666666666666667,0.2666666666666667,-0.4666666666666667,0.0,-0.1333333333333333,0.0,0.5333333333333333,-0.33333333333333337,0.0,0.0,0.0,0.8,-0.1333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,-0.5333333333333333,0.8666666666666667,0.33333333333333337,0.0],"cost_curve":[0.094662,0.05397,0.17244225,0.067484,0.033631,0.03428475,0.05310125,0.03668875,0.05625625,0.0601645,0.0912335,0.0631995,0.20094975,0.079299,0.08150275,0.082866,0.053893,0.05415225,0.2697385,0.4308985,0.14014625,0.21856975,0.14988075,0.07897875,0.118602,0.08437625,0.284928,0.12195725,0.355717,0.094571,0.09106525,0.18827475,0.0974055,0.186815,0.14779925,0.2970065,0.1632755,0.15706825,0.1074675,0.10603675]},{"run_name":"icl-claude-opus-4.7","task":"database_exploration","run_index":4,"reward":17.86666666666667,"baseline_reward":6.066666666666666,"reference_reward":40.0,"gain":11.800000000000004,"normalized_reward":0.3578336557059963,"normalized_gain":0.34774066797642444,"cost_usd":4.61928875,"latency_seconds":4.454831,"instance_count":40,"reward_curve":[0.4666666666666667,0.0,0.8,0.0,0.9333333333333333,0.8666666666666667,0.0,0.9333333333333333,0.0,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.8666666666666667,0.0,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.5333333333333333,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.9333333333333333,0.8666666666666667,0.0,0.0,0.9333333333333333,0.0,0.0,0.6666666666666667,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.8666666666666667,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.7333333333333334,0.0,0.7333333333333334,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.5333333333333333,0.0,0.19999999999999996,0.4666666666666667,0.0,0.1333333333333333,0.0,0.4,0.33333333333333337,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.6,0.0],"gain_curve":[0.4666666666666667,0.0,0.8,-0.7333333333333334,0.9333333333333333,0.1333333333333333,-0.6666666666666667,0.6666666666666666,-0.33333333333333337,0.9333333333333333,0.33333333333333337,0.0,-0.19999999999999996,-0.4666666666666667,0.8666666666666667,-0.1333333333333333,0.8666666666666667,0.5333333333333333,0.6,0.5333333333333333,0.0,0.0,0.0,-0.1333333333333333,0.8666666666666667,0.0,0.9333333333333333,0.8666666666666667,0.0,0.0,0.9333333333333333,0.0,0.0,0.6666666666666667,0.9333333333333333,0.9333333333333333,-0.5333333333333333,0.9333333333333333,0.2666666666666667,0.0],"cost_curve":[0.10849225,0.08301725,0.05502875,0.02973425,0.03166825,0.04706175,0.30478225,0.043197,0.0663775,0.04179025,0.06515575,0.04400525,0.1552205,0.05028775,0.082523,0.28325775,0.08481575,0.059138,0.06137725,0.2624655,0.34514,0.225391,0.11723725,0.1178115,0.11936125,0.125044,0.0811795,0.12918075,0.08410525,0.087555,0.0897665,0.08913625,0.09074375,0.2842485,0.090577,0.097524,0.1460935,0.09606625,0.14357575,0.10015575]},{"run_name":"icl-claude-opus-4.7","task":"exploitable_poker","run_index":0,"reward":124.0,"baseline_reward":157.70000000000002,"reference_reward":1138.5,"gain":-33.70000000000002,"normalized_reward":-0.009553189372076817,"normalized_gain":-0.03435970636215337,"cost_usd":17.6531885,"latency_seconds":4.726075,"instance_count":120,"reward_curve":[-2.0,1.0,-1.0,8.0,-1.0,1.0,8.0,-4.0,24.0,-11.0,10.0,-22.0,6.0,8.0,-4.0,-2.5,12.0,-1.0,9.0,20.0,-1.0,-5.0,-1.0,-3.0,1.0,-1.0,0.5,-0.5,0.5,-0.5,0.0,0.5,1.0,-7.0,0.5,0.5,-4.0,0.0,2.0,10.0,-1.0,3.0,-1.0,1.0,-2.0,-8.0,10.0,-0.5,-0.5,2.0,12.0,2.5,-8.0,-8.0,5.0,6.0,2.0,-8.0,-1.0,-1.0,1.0,0.5,0.5,-2.5,-0.5,1.0,1.0,5.0,1.0,-2.5,3.5,16.0,0.5,-2.0,-2.0,-1.0,-0.5,0.5,-1.0,1.0,1.0,0.5,10.5,3.0,3.0,1.0,1.0,7.0,1.0,-2.0,0.5,1.0,1.0,-1.0,0.5,0.5,1.0,-1.0,4.0,3.0,0.5,-1.0,-4.0,-1.0,2.0,-1.0,0.5,-11.0,-1.0,1.0,5.0,1.0,5.0,3.0,-1.0,-2.0,19.0,1.0,0.5,-4.0],"baseline_reward_curve":[-1.0,2.0,-1.0,8.0,-4.5,2.0,4.0,-2.2,9.9,-4.8,4.4,-10.0,4.4,18.0,-2.0,-9.0,4.8,-5.0,10.6,28.5,2.0,-5.0,-1.0,-3.0,1.0,-1.0,0.5,-1.0,0.5,-1.0,0.0,0.5,-1.0,-7.0,0.5,0.5,-4.0,0.0,1.0,10.0,-1.0,3.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,1.0,2.0,3.5,2.5,-7.0,-8.0,5.4,3.7,3.5,-4.0,-4.0,-1.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-3.7,3.5,74.0,0.5,-4.0,-3.5,4.0,-2.2,0.5,-2.2,2.4,1.0,0.5,10.5,3.0,3.0,-4.6,1.0,4.0,1.0,-2.2,0.5,-4.0,-4.0,-1.0,0.5,0.5,3.0,-1.0,4.0,3.0,0.5,-2.0,-4.0,1.0,-1.0,2.0,0.5,-25.0,-2.0,2.0,3.0,2.0,2.0,3.0,5.0,2.0,23.0,1.0,0.5,-4.0],"gain_curve":[-1.0,-1.0,0.0,0.0,3.5,-1.0,4.0,-1.7999999999999998,14.1,-6.2,5.6,-12.0,1.5999999999999996,-10.0,-2.0,6.5,7.2,4.0,-1.5999999999999996,-8.5,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-4.0,-3.0,0.0,0.5,-1.5,0.0,8.5,0.0,-1.0,0.0,-0.40000000000000036,2.3,-1.5,-4.0,3.0,0.0,0.0,0.0,0.0,-0.5,-1.5,0.0,0.0,4.0,0.0,1.2000000000000002,0.0,-58.0,0.0,2.0,1.5,-5.0,1.7000000000000002,0.0,1.2000000000000002,-1.4,0.0,0.0,0.0,0.0,0.0,5.6,0.0,3.0,0.0,0.20000000000000018,0.0,5.0,5.0,0.0,0.0,0.0,-2.0,0.0,0.0,0.0,0.0,1.0,0.0,-2.0,3.0,-3.0,0.0,14.0,1.0,-1.0,2.0,-1.0,3.0,0.0,-6.0,-4.0,-4.0,0.0,0.0,0.0],"cost_curve":[0.038695,0.0554225,0.0457015,0.0563185,0.05341525,0.05244675,0.0650455,0.0610485,0.06948375,0.0869425,0.081515,0.08468175,0.0854235,0.0877255,0.09044625,0.09886525,0.10502875,0.105189,0.1129045,0.1181015,0.0314705,0.209527,0.06079375,0.130785,0.065054,0.063347,0.0,0.03327575,0.0,0.03484125,0.13557025,0.0,0.0360035,0.17877125,0.0,0.0,0.1851975,0.150087,0.156379,0.2104985,0.120896,0.20687925,0.0847865,0.08634875,0.1746885,0.22640825,0.23494425,0.0474285,0.04687425,0.18800325,0.19601925,0.19471375,0.20648275,0.20578175,0.21551825,0.21920425,0.215708,0.223327,0.22485775,0.23172975,0.2350965,0.0,0.0,0.2413725,0.061579,0.06061925,0.1238415,0.247948,0.063076,0.25667625,0.255497,0.2701415,0.0,0.26871625,0.269304,0.272789,0.06883,0.0,0.2774755,0.070121,0.070421,0.0,0.28613775,0.14745925,0.14601875,0.07225425,0.0729,0.29381775,0.07515525,0.296692,0.0,0.07671575,0.07596475,0.300661,0.0,0.0,0.1574055,0.15305275,0.235323,0.15843725,0.0,0.1587675,0.31727025,0.3199395,0.330626,0.08377575,0.0,0.50694625,0.08534575,0.08482475,0.60773925,0.17358725,0.35124175,0.3536505,0.17584575,0.3555335,0.55901975,0.18462425,0.0,0.5563495]},{"run_name":"icl-claude-opus-4.7","task":"exploitable_poker","run_index":1,"reward":62.0,"baseline_reward":157.70000000000002,"reference_reward":1138.5,"gain":-95.70000000000002,"normalized_reward":-0.0712508707334063,"normalized_gain":-0.09757340946166397,"cost_usd":19.4668675,"latency_seconds":4.899692,"instance_count":120,"reward_curve":[10.8,7.0,1.0,-1.0,3.0,3.0,-2.2,-1.0,5.4,8.3,-3.0,24.0,2.4,-2.4,4.8,-11.4,-3.0,-1.0,-3.0,1.0,-2.0,-1.0,0.0,1.0,-1.0,0.5,10.0,-4.0,0.0,0.5,2.0,-1.0,0.5,-1.0,-1.0,0.5,-1.0,-2.0,1.0,0.5,-4.0,-0.5,-2.0,1.0,-1.0,10.0,1.0,-9.0,-0.5,2.0,-3.0,12.0,-6.0,-6.0,-6.0,2.2,2.2,2.0,9.6,-1.0,5.2,-1.0,-11.0,3.0,-1.0,-0.5,0.5,0.5,-2.2,3.0,0.5,-1.0,1.0,1.0,1.0,-2.2,1.0,3.0,-7.8,-2.4,0.5,0.5,2.4,0.5,-5.2,1.0,12.0,-6.0,-1.0,1.0,0.5,-2.2,1.0,1.0,4.2,-4.0,0.5,-1.0,5.0,-1.0,-21.0,1.0,2.0,0.5,-2.0,2.0,-4.0,-1.0,0.5,3.0,-2.0,-2.0,23.0,-1.0,4.0,0.5,3.0,1.0,3.0,3.0],"baseline_reward_curve":[-1.0,2.0,-1.0,8.0,-4.5,2.0,4.0,-2.2,9.9,-4.8,4.4,-10.0,4.4,18.0,-2.0,-9.0,4.8,-5.0,10.6,28.5,2.0,-5.0,-1.0,-3.0,1.0,-1.0,0.5,-1.0,0.5,-1.0,0.0,0.5,-1.0,-7.0,0.5,0.5,-4.0,0.0,1.0,10.0,-1.0,3.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,1.0,2.0,3.5,2.5,-7.0,-8.0,5.4,3.7,3.5,-4.0,-4.0,-1.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-3.7,3.5,74.0,0.5,-4.0,-3.5,4.0,-2.2,0.5,-2.2,2.4,1.0,0.5,10.5,3.0,3.0,-4.6,1.0,4.0,1.0,-2.2,0.5,-4.0,-4.0,-1.0,0.5,0.5,3.0,-1.0,4.0,3.0,0.5,-2.0,-4.0,1.0,-1.0,2.0,0.5,-25.0,-2.0,2.0,3.0,2.0,2.0,3.0,5.0,2.0,23.0,1.0,0.5,-4.0],"gain_curve":[11.8,5.0,2.0,-9.0,7.5,1.0,-6.2,1.2000000000000002,-4.5,13.100000000000001,-7.4,34.0,-2.0000000000000004,-20.4,6.8,-2.4000000000000004,-7.8,4.0,-13.6,-27.5,-4.0,4.0,1.0,4.0,-2.0,1.5,9.5,-3.0,-0.5,1.5,2.0,-1.5,1.5,6.0,-1.5,0.0,3.0,-2.0,0.0,-9.5,-3.0,-3.5,-1.0,0.0,-3.0,15.0,-9.0,-8.0,-1.5,0.0,-6.5,9.5,1.0,2.0,-11.4,-1.5,-1.2999999999999998,6.0,13.6,0.0,4.2,-1.5,-11.5,5.0,-2.0,-1.5,-0.5,-0.5,-3.2,6.7,-3.0,-75.0,0.5,5.0,4.5,-6.2,3.2,2.5,-5.6,-4.8,-0.5,0.0,-8.1,-2.5,-8.2,5.6,11.0,-10.0,-2.0,3.2,0.0,1.7999999999999998,5.0,2.0,3.7,-4.5,-2.5,0.0,1.0,-4.0,-21.5,3.0,6.0,-0.5,-1.0,0.0,-4.5,24.0,2.5,1.0,-5.0,-4.0,21.0,-4.0,-1.0,-1.5,-20.0,0.0,2.5,7.0],"cost_curve":[0.038515,0.062865,0.0449415,0.0462825,0.05232,0.0606845,0.0621805,0.062515,0.07022125,0.0805565,0.0773765,0.080787,0.08631475,0.089441,0.093797,0.09887025,0.10006725,0.10132275,0.11188275,0.11189375,0.08609875,0.05713175,0.119694,0.0618705,0.060725,0.0,0.18009925,0.17541275,0.13826025,0.0,0.10720325,0.07026825,0.0,0.071887,0.07345675,0.0,0.11023075,0.1553615,0.08002825,0.0,0.20531375,0.04166875,0.124598,0.08679,0.08627375,0.22465625,0.0924445,0.27483725,0.04802825,0.20367925,0.19684375,0.202158,0.20185225,0.2200975,0.211132,0.2231425,0.2188225,0.21887475,0.22740575,0.226937,0.23548125,0.2379235,0.2456625,0.1234415,0.24370875,0.06304725,0.0,0.0,0.25210275,0.12718025,0.0,0.25477175,0.13439775,0.13222725,0.2672645,0.271424,0.06840225,0.13768675,0.290703,0.2879285,0.0,0.0,0.28872125,0.0,0.29148025,0.14719775,0.31238425,0.303104,0.3020765,0.1552455,0.0,0.31243775,0.0787285,0.15672475,0.31655325,0.48959775,0.0,0.16376175,0.585107,0.169931,0.51655425,0.17352,0.25910025,0.0,0.3473535,0.26341825,0.354309,0.35491625,0.0,0.181356,0.360939,0.36689925,0.5708355,0.18953725,0.2838365,0.0,0.191499,0.1915725,0.38782175,0.410876]},{"run_name":"icl-claude-opus-4.7","task":"exploitable_poker","run_index":2,"reward":109.8,"baseline_reward":157.70000000000002,"reference_reward":1138.5,"gain":-47.90000000000002,"normalized_reward":-0.023683948651607122,"normalized_gain":-0.04883768352365418,"cost_usd":15.325528,"latency_seconds":5.148204,"instance_count":120,"reward_curve":[9.9,-3.0,-4.4,4.8,-1.0,4.2,-3.0,-2.5,-9.6,1.0,22.0,-1.0,-1.0,9.6,3.0,2.0,5.5,-1.0,4.8,6.0,0.0,-0.5,1.0,-8.0,-0.5,0.5,2.0,-3.0,0.5,0.5,-1.0,1.0,1.0,-5.0,-1.0,0.5,-1.0,-1.0,-1.0,-0.5,-0.5,10.0,-0.5,10.0,-0.5,0.5,0.0,-1.0,-0.5,-0.5,4.4,2.0,-3.0,-6.0,-1.0,2.2,19.0,-1.0,4.6,-6.0,0.5,6.0,0.5,-1.0,0.5,1.0,-1.0,-1.0,-0.5,-0.5,1.0,11.5,-0.5,-2.2,-0.5,3.0,-1.0,1.0,-1.0,-0.5,1.0,1.0,0.5,1.0,12.0,0.5,0.5,-1.0,-2.0,-1.0,2.5,-0.5,-0.5,0.5,3.0,-0.5,1.0,-1.0,1.0,-1.0,0.5,3.0,-18.0,3.0,-1.0,2.0,-1.0,0.5,-1.0,21.0,2.0,-1.0,-0.5,-1.0,-1.0,0.5,-1.0,0.5,3.0,4.0],"baseline_reward_curve":[-1.0,2.0,-1.0,8.0,-4.5,2.0,4.0,-2.2,9.9,-4.8,4.4,-10.0,4.4,18.0,-2.0,-9.0,4.8,-5.0,10.6,28.5,2.0,-5.0,-1.0,-3.0,1.0,-1.0,0.5,-1.0,0.5,-1.0,0.0,0.5,-1.0,-7.0,0.5,0.5,-4.0,0.0,1.0,10.0,-1.0,3.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,1.0,2.0,3.5,2.5,-7.0,-8.0,5.4,3.7,3.5,-4.0,-4.0,-1.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-3.7,3.5,74.0,0.5,-4.0,-3.5,4.0,-2.2,0.5,-2.2,2.4,1.0,0.5,10.5,3.0,3.0,-4.6,1.0,4.0,1.0,-2.2,0.5,-4.0,-4.0,-1.0,0.5,0.5,3.0,-1.0,4.0,3.0,0.5,-2.0,-4.0,1.0,-1.0,2.0,0.5,-25.0,-2.0,2.0,3.0,2.0,2.0,3.0,5.0,2.0,23.0,1.0,0.5,-4.0],"gain_curve":[10.9,-5.0,-3.4000000000000004,-3.2,3.5,2.2,-7.0,-0.2999999999999998,-19.5,5.8,17.6,9.0,-5.4,-8.4,5.0,11.0,0.7000000000000002,4.0,-5.8,-22.5,-2.0,4.5,2.0,-5.0,-1.5,1.5,1.5,-2.0,0.0,1.5,-1.0,0.5,2.0,2.0,-1.5,0.0,3.0,-1.0,-2.0,-10.5,0.5,7.0,0.5,9.0,-2.5,5.5,-10.0,0.0,-1.5,-2.5,0.9000000000000004,-0.5,4.0,2.0,-6.4,-1.5,15.5,3.0,8.6,-5.0,-0.5,5.5,0.0,1.0,-0.5,0.0,-2.0,-2.0,-1.5,3.2,-2.5,-62.5,-1.0,1.7999999999999998,3.0,-1.0,1.2000000000000002,0.5,1.2000000000000002,-2.9,0.0,0.5,-10.0,-2.0,9.0,5.1,-0.5,-5.0,-3.0,1.2000000000000002,2.0,3.5,3.5,1.5,2.5,-1.0,-2.0,0.0,-3.0,-4.0,0.0,5.0,-14.0,2.0,0.0,0.0,-1.5,25.5,1.0,19.0,-1.0,-3.0,-2.5,-4.0,-6.0,-1.5,-24.0,-0.5,2.5,8.0],"cost_curve":[0.0413,0.0575465,0.05660375,0.06010025,0.05551125,0.064904,0.06524025,0.07338675,0.07345875,0.080169,0.08249125,0.08121875,0.083264,0.09665675,0.101431,0.10149975,0.10956725,0.105059,0.11506775,0.12229475,0.125729,0.03337275,0.032065,0.16929375,0.0361305,0.0,0.14482075,0.14324325,0.0,0.0,0.0378105,0.072975,0.07262725,0.27605975,0.11396175,0.0,0.0784195,0.0812515,0.0804635,0.0409535,0.0403295,0.219236,0.04427175,0.2319905,0.0465715,0.0,0.177113,0.04678975,0.045685,0.046671,0.191276,0.1934525,0.1985605,0.201969,0.20306325,0.20523,0.21765275,0.214353,0.22079725,0.22695525,0.0,0.22721625,0.0,0.23018275,0.0,0.2367435,0.240605,0.24293125,0.06208725,0.06148175,0.06165825,0.25666225,0.06395425,0.2575155,0.06496675,0.12922025,0.25973175,0.06601,0.26287875,0.06741,0.06726875,0.13581225,0.0,0.06833825,0.284679,0.0,0.0,0.27860625,0.29064875,0.2885935,0.2991915,0.07537375,0.07365275,0.0,0.14983625,0.0754995,0.1517855,0.07493375,0.15207175,0.07644625,0.0,0.394888,0.40386775,0.3305385,0.081651,0.32643925,0.08270625,0.0,0.165044,0.51565425,0.25332725,0.3355105,0.0857345,0.174747,0.17312775,0.0,0.08729475,0.0,0.1757195,0.267369]},{"run_name":"icl-claude-opus-4.7","task":"exploitable_poker","run_index":3,"reward":74.99999999999997,"baseline_reward":157.70000000000002,"reference_reward":1138.5,"gain":-82.70000000000005,"normalized_reward":-0.05831426012538563,"normalized_gain":-0.08431892332789565,"cost_usd":17.25325525,"latency_seconds":4.819827,"instance_count":120,"reward_curve":[4.7,6.0,4.7,10.2,-2.2,-5.5,6.0,-5.5,-9.2,2.2,-3.0,2.2,-3.0,2.0,-1.0,-1.0,20.0,-11.2,6.0,28.0,-1.0,-1.0,-1.0,1.0,0.0,-2.0,-1.0,-1.0,-2.0,-1.0,2.0,0.5,0.5,1.0,0.5,-1.0,0.0,-5.0,-1.0,10.0,10.0,-1.0,-3.0,-0.5,0.5,-0.5,-0.5,-2.0,0.5,-0.5,-11.0,-1.0,4.7,10.4,1.0,2.0,-11.0,-2.2,9.7,-1.0,1.0,3.0,0.5,11.0,1.0,1.0,-2.4,4.7,1.0,0.5,-2.2,4.9,0.5,-2.2,1.0,-4.9,1.0,-4.4,0.5,-9.4,4.7,0.5,0.5,-4.9,-1.0,1.0,1.0,1.0,0.5,-2.2,1.0,-2.2,3.0,1.0,3.0,-18.0,21.0,-2.0,0.5,0.5,-1.0,2.0,4.0,0.5,1.0,2.0,3.0,-1.0,3.0,-1.0,0.5,2.0,-1.0,-0.5,-2.0,-1.0,-1.0,-1.0,-4.0,2.0],"baseline_reward_curve":[-1.0,2.0,-1.0,8.0,-4.5,2.0,4.0,-2.2,9.9,-4.8,4.4,-10.0,4.4,18.0,-2.0,-9.0,4.8,-5.0,10.6,28.5,2.0,-5.0,-1.0,-3.0,1.0,-1.0,0.5,-1.0,0.5,-1.0,0.0,0.5,-1.0,-7.0,0.5,0.5,-4.0,0.0,1.0,10.0,-1.0,3.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,1.0,2.0,3.5,2.5,-7.0,-8.0,5.4,3.7,3.5,-4.0,-4.0,-1.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-3.7,3.5,74.0,0.5,-4.0,-3.5,4.0,-2.2,0.5,-2.2,2.4,1.0,0.5,10.5,3.0,3.0,-4.6,1.0,4.0,1.0,-2.2,0.5,-4.0,-4.0,-1.0,0.5,0.5,3.0,-1.0,4.0,3.0,0.5,-2.0,-4.0,1.0,-1.0,2.0,0.5,-25.0,-2.0,2.0,3.0,2.0,2.0,3.0,5.0,2.0,23.0,1.0,0.5,-4.0],"gain_curve":[5.7,4.0,5.7,2.1999999999999993,2.3,-7.5,2.0,-3.3,-19.1,7.0,-7.4,12.2,-7.4,-16.0,1.0,8.0,15.2,-6.199999999999999,-4.6,-0.5,-3.0,4.0,0.0,4.0,-1.0,-1.0,-1.5,0.0,-2.5,0.0,2.0,0.0,1.5,8.0,0.0,-1.5,4.0,-5.0,-2.0,0.0,11.0,-4.0,-2.0,-1.5,-1.5,4.5,-10.5,-1.0,-0.5,-2.5,-14.5,-3.5,11.7,18.4,-4.4,-1.7000000000000002,-14.5,1.7999999999999998,13.7,0.0,0.0,2.5,0.0,13.0,0.0,0.0,-3.4,3.7,0.0,4.2,-5.7,-69.1,0.0,1.7999999999999998,4.5,-8.9,3.2,-4.9,2.7,-11.8,3.7,0.0,-10.0,-7.9,-4.0,5.6,0.0,-3.0,-0.5,0.0,0.5,1.7999999999999998,7.0,2.0,2.5,-18.5,18.0,-1.0,-3.5,-2.5,-1.5,4.0,8.0,-0.5,2.0,0.0,2.5,24.0,5.0,-3.0,-2.5,0.0,-3.0,-3.5,-7.0,-3.0,-24.0,-2.0,-4.5,6.0],"cost_curve":[0.038695,0.063261,0.045734,0.04930125,0.05342225,0.06515525,0.0751035,0.07408125,0.069612,0.0783085,0.07747225,0.08517225,0.08563225,0.08738625,0.09307025,0.0960685,0.1018855,0.1069365,0.113345,0.1127855,0.11570525,0.05968625,0.08710375,0.06651025,0.125789,0.097628,0.03340625,0.066131,0.13665025,0.07145675,0.10795375,0.0,0.0,0.07259225,0.0,0.07274725,0.15061,0.2691635,0.08036475,0.211745,0.2215725,0.0856775,0.174963,0.04616475,0.0,0.04595025,0.0458365,0.13656025,0.0,0.04623025,0.191018,0.18973,0.206048,0.20785625,0.20196075,0.20263675,0.21673125,0.215886,0.2243055,0.22093925,0.11346675,0.1126985,0.0,0.23674275,0.117424,0.11685775,0.239711,0.243095,0.122898,0.0,0.24935175,0.2553325,0.0,0.264373,0.06661625,0.2649875,0.1347465,0.26834575,0.0,0.274004,0.28437225,0.0,0.0,0.2842285,0.28147625,0.144168,0.144425,0.07231225,0.0,0.29618925,0.1506765,0.3011825,0.152238,0.1509625,0.15336575,0.40233925,0.48607675,0.3233545,0.0,0.0,0.0820695,0.411051,0.250195,0.0,0.166513,0.24907775,0.1697565,0.17084725,0.43701975,0.1725905,0.0,0.176041,0.17373725,0.08801575,0.2647125,0.352872,0.090664,0.180487,0.549529,0.28232]},{"run_name":"icl-claude-opus-4.7","task":"exploitable_poker","run_index":4,"reward":212.60000000000002,"baseline_reward":157.70000000000002,"reference_reward":1138.5,"gain":54.900000000000006,"normalized_reward":0.07861478754104889,"normalized_gain":0.0559747145187602,"cost_usd":17.633791,"latency_seconds":5.135157,"instance_count":120,"reward_curve":[9.0,-5.0,4.0,35.5,-4.0,16.5,4.0,12.0,2.5,-6.5,-8.0,-1.0,16.5,47.0,-3.5,-1.0,-4.0,1.0,7.5,-1.0,1.0,0.5,10.0,1.0,-1.0,0.0,2.0,3.0,-5.0,10.0,-9.0,-1.0,0.5,-1.0,-4.0,4.0,1.0,2.0,-1.0,-1.0,0.5,-0.5,1.0,0.5,0.5,2.0,1.0,-1.0,-2.0,-1.0,-4.0,-8.0,-4.0,-8.0,13.5,1.0,-1.0,27.5,5.5,2.0,1.0,4.0,1.0,0.5,-4.7,1.0,1.0,1.0,1.0,0.5,-12.5,-4.0,1.0,-5.2,26.5,2.2,-2.2,0.5,0.5,1.0,0.5,1.0,1.0,1.0,-4.0,0.5,3.5,-0.5,19.0,1.0,0.5,3.5,3.5,1.0,1.0,1.0,-4.0,1.0,2.0,-4.0,0.5,4.5,2.0,-2.0,3.0,-2.0,21.0,0.5,0.5,-2.0,2.0,-1.0,4.0,3.5,0.5,-2.0,1.0,-18.0,-4.0,-1.0],"baseline_reward_curve":[-1.0,2.0,-1.0,8.0,-4.5,2.0,4.0,-2.2,9.9,-4.8,4.4,-10.0,4.4,18.0,-2.0,-9.0,4.8,-5.0,10.6,28.5,2.0,-5.0,-1.0,-3.0,1.0,-1.0,0.5,-1.0,0.5,-1.0,0.0,0.5,-1.0,-7.0,0.5,0.5,-4.0,0.0,1.0,10.0,-1.0,3.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,1.0,2.0,3.5,2.5,-7.0,-8.0,5.4,3.7,3.5,-4.0,-4.0,-1.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-3.7,3.5,74.0,0.5,-4.0,-3.5,4.0,-2.2,0.5,-2.2,2.4,1.0,0.5,10.5,3.0,3.0,-4.6,1.0,4.0,1.0,-2.2,0.5,-4.0,-4.0,-1.0,0.5,0.5,3.0,-1.0,4.0,3.0,0.5,-2.0,-4.0,1.0,-1.0,2.0,0.5,-25.0,-2.0,2.0,3.0,2.0,2.0,3.0,5.0,2.0,23.0,1.0,0.5,-4.0],"gain_curve":[10.0,-7.0,5.0,27.5,0.5,14.5,0.0,14.2,-7.4,-1.7000000000000002,-12.4,9.0,12.1,29.0,-1.5,8.0,-8.8,6.0,-3.0999999999999996,-29.5,-1.0,5.5,11.0,4.0,-2.0,1.0,1.5,4.0,-5.5,11.0,-9.0,-1.5,1.5,6.0,-4.5,3.5,5.0,2.0,-2.0,-11.0,1.5,-3.5,2.0,-0.5,-1.5,7.0,-9.0,0.0,-3.0,-3.0,-7.5,-10.5,3.0,0.0,8.1,-2.7,-4.5,31.5,9.5,3.0,0.0,3.5,0.5,2.5,-5.7,0.0,0.0,0.0,0.0,4.2,-16.0,-78.0,0.5,-1.2000000000000002,30.0,-1.7999999999999998,0.0,0.0,2.7,-1.4,-0.5,0.5,-9.5,-2.0,-7.0,5.1,2.5,-4.5,18.0,3.2,0.0,7.5,7.5,2.0,0.5,0.5,-7.0,2.0,-2.0,-7.0,0.0,6.5,6.0,-3.0,4.0,-4.0,20.5,25.5,2.5,-4.0,-1.0,-3.0,2.0,0.5,-4.5,-4.0,-22.0,-19.0,-4.5,3.0],"cost_curve":[0.042905,0.05768375,0.05165025,0.050892,0.05505825,0.0620495,0.06570975,0.07239,0.069494,0.07549725,0.093948,0.07989425,0.097006,0.0926295,0.100086,0.09450775,0.099484,0.108907,0.11362975,0.111492,0.05920425,0.0,0.166576,0.032541,0.06783225,0.127578,0.066278,0.17026425,0.24582575,0.1981905,0.19875925,0.078567,0.0,0.07849275,0.2118955,0.211914,0.085595,0.172582,0.12868275,0.08699,0.0,0.04499625,0.09044275,0.0,0.0,0.18700075,0.04750525,0.09311425,0.191627,0.096602,0.19874125,0.201372,0.20654425,0.2104275,0.21741175,0.21375725,0.222221,0.232564,0.234015,0.231981,0.119884,0.1183875,0.0594315,0.0,0.2561775,0.06538075,0.061921,0.061992,0.06232825,0.0,0.25785575,0.26344125,0.1969225,0.273068,0.273758,0.276463,0.27737675,0.0,0.0,0.14376125,0.0,0.14174975,0.07073475,0.0713175,0.28630725,0.0,0.14567175,0.073604,0.30715275,0.07669975,0.0,0.14968475,0.14997925,0.075165,0.0760835,0.1543075,0.31036375,0.1581285,0.31673775,0.401326,0.0,0.24461675,0.24301925,0.327981,0.3350535,0.33308675,0.52962025,0.0,0.0,0.43491075,0.17968225,0.349775,0.178411,0.17981875,0.0,0.360495,0.09136775,0.4639565,0.55845,0.1873785]},{"run_name":"icl-claude-opus-4.7","task":"sales_prediction","run_index":0,"reward":9.5489,"baseline_reward":4.3896,"reference_reward":12.0,"gain":5.1593,"normalized_reward":0.6171771283989567,"normalized_gain":0.6779275727951224,"cost_usd":4.398936,"latency_seconds":9.479297,"instance_count":12,"reward_curve":[0.4428,0.7374,0.8281,0.7698,0.764,0.8074,0.7973,0.888,0.8287,0.8561,0.8973,0.932],"baseline_reward_curve":[0.3475,0.272,0.3812,0.501,0.3545,0.3055,0.5749,0.3912,0.3331,0.3249,0.2714,0.3324],"gain_curve":[0.09530000000000005,0.46540000000000004,0.44689999999999996,0.26880000000000004,0.40950000000000003,0.5019,0.22240000000000004,0.4968,0.4956,0.5311999999999999,0.6259,0.5996000000000001],"cost_curve":[0.3630835,0.436219,0.33503525,0.3500485,0.338965,0.3626465,0.336095,0.3567765,0.37614575,0.366159,0.38093425,0.39682775]},{"run_name":"icl-claude-opus-4.7","task":"sales_prediction","run_index":1,"reward":8.113199999999999,"baseline_reward":4.3896,"reference_reward":12.0,"gain":3.7235999999999994,"normalized_reward":0.39294360191794075,"normalized_gain":0.4892778303374329,"cost_usd":4.215115,"latency_seconds":9.520929,"instance_count":12,"reward_curve":[0.5377,0.6869,0.7275,0.7031,0.6986,0.6857,0.6786,0.704,0.6726,0.6682,0.6862,0.6641],"baseline_reward_curve":[0.3475,0.272,0.3812,0.501,0.3545,0.3055,0.5749,0.3912,0.3331,0.3249,0.2714,0.3324],"gain_curve":[0.19019999999999998,0.41489999999999994,0.34630000000000005,0.20209999999999995,0.3441,0.3802,0.10370000000000001,0.31279999999999997,0.33949999999999997,0.3433,0.41480000000000006,0.33170000000000005],"cost_curve":[0.25651475,0.42033625,0.33125325,0.3503495,0.2941165,0.3118905,0.32936575,0.35057925,0.36498975,0.38427225,0.40191475,0.4195325]},{"run_name":"icl-claude-opus-4.7","task":"sales_prediction","run_index":2,"reward":8.5972,"baseline_reward":4.3896,"reference_reward":12.0,"gain":4.207600000000001,"normalized_reward":0.4685367110750154,"normalized_gain":0.5528750131399139,"cost_usd":4.833654,"latency_seconds":9.140023,"instance_count":12,"reward_curve":[0.3957,0.7455,0.7567,0.7563,0.7579,0.7172,0.7266,0.7747,0.743,0.7189,0.7478,0.7569],"baseline_reward_curve":[0.3475,0.272,0.3812,0.501,0.3545,0.3055,0.5749,0.3912,0.3331,0.3249,0.2714,0.3324],"gain_curve":[0.04820000000000002,0.47350000000000003,0.37550000000000006,0.25529999999999997,0.40340000000000004,0.41169999999999995,0.15170000000000006,0.38350000000000006,0.4099,0.39399999999999996,0.47640000000000005,0.42450000000000004],"cost_curve":[0.23259875,0.3429215,0.355446,0.3566195,0.379427,0.375217,0.40062175,0.4291815,0.4520715,0.47704225,0.502771,0.52973625]},{"run_name":"icl-claude-opus-4.7","task":"sales_prediction","run_index":3,"reward":9.364099999999999,"baseline_reward":4.3896,"reference_reward":12.0,"gain":4.974499999999999,"normalized_reward":0.588314304902619,"normalized_gain":0.6536450120887205,"cost_usd":3.991167,"latency_seconds":9.334106,"instance_count":12,"reward_curve":[0.3856,0.6078,0.8131,0.7579,0.8029,0.8461,0.7957,0.8807,0.8291,0.8366,0.8922,0.9164],"baseline_reward_curve":[0.3475,0.272,0.3812,0.501,0.3545,0.3055,0.5749,0.3912,0.3331,0.3249,0.2714,0.3324],"gain_curve":[0.03810000000000002,0.3358,0.43190000000000006,0.2569,0.44839999999999997,0.5406,0.2208,0.48950000000000005,0.49599999999999994,0.5117,0.6208,0.5840000000000001],"cost_curve":[0.29071775,0.4234325,0.339513,0.29265725,0.3069035,0.29387175,0.30216525,0.31891125,0.33453225,0.349051,0.36192875,0.37748275]},{"run_name":"icl-claude-opus-4.7","task":"sales_prediction","run_index":4,"reward":9.5706,"baseline_reward":4.3896,"reference_reward":12.0,"gain":5.181000000000001,"normalized_reward":0.6205663235822388,"normalized_gain":0.6807789340901925,"cost_usd":4.352254,"latency_seconds":10.258779,"instance_count":12,"reward_curve":[0.3865,0.7296,0.8351,0.7639,0.8005,0.8111,0.8567,0.884,0.8735,0.8679,0.8694,0.8924],"baseline_reward_curve":[0.3475,0.272,0.3812,0.501,0.3545,0.3055,0.5749,0.3912,0.3331,0.3249,0.2714,0.3324],"gain_curve":[0.039000000000000035,0.4576,0.45389999999999997,0.2629,0.446,0.5056,0.28180000000000005,0.4928,0.5404,0.5429999999999999,0.598,0.56],"cost_curve":[0.30643825,0.4204925,0.34734525,0.30641125,0.28060325,0.374999,0.3382925,0.35736525,0.37505025,0.394551,0.4149095,0.435796]},{"run_name":"icl-claude-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":0,"reward":37.5052,"baseline_reward":19.7597,"reference_reward":90.0,"gain":17.745500000000003,"normalized_reward":0.2526260339697319,"normalized_gain":0.25263986628758706,"cost_usd":3.7048572,"latency_seconds":9.842013,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3669,0.3495,0.3612,0.3286,0.2831,0.3398,0.3406,0.3804,0.3609,0.3416,0.4461,0.335,0.3901,0.4137,0.3758,0.33,0.401,0.3929,0.3769,0.4048,0.4155,0.45,0.3613,0.3595,0.3333,0.2799,0.3724,0.3521,0.354,0.3116,0.3228,0.2913,0.329,0.3895,0.3986,0.3808,0.4428,0.4757,0.477,0.4857,0.5099,0.5316,0.5006,0.5177,0.5628,0.5508,0.5219,0.5469,0.4202,0.4303,0.42,0.4189,0.4739,0.4334,0.4469,0.4122,0.3594,0.4874,0.5196,0.4798,0.4075,0.4271,0.4128,0.4137,0.4961,0.4593,0.4646,0.5031,0.582,0.5598,0.4849,0.4385,0.4508,0.5287,0.5265,0.5908,0.5078,0.5422,0.4315,0.3373,0.3725,0.3979,0.3841,0.352,0.3435,0.4108,0.3779],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.1405,0.12539999999999998,0.14840000000000003,0.1013,0.08810000000000001,0.1177,0.128,0.14,0.1324,0.12230000000000002,0.1978,0.14300000000000002,0.1927,0.18980000000000002,0.14880000000000002,0.12350000000000003,0.15360000000000001,0.19110000000000002,0.17500000000000002,0.1918,0.20719999999999997,0.22560000000000002,0.128,0.15009999999999998,0.12279999999999999,0.04869999999999999,0.1652,0.15390000000000004,0.1455,0.1021,0.12009999999999998,0.0678,0.11510000000000001,0.18660000000000002,0.1572,0.18350000000000002,0.22250000000000003,0.24930000000000002,0.2844,0.24600000000000002,0.2883,0.30429999999999996,0.27320000000000005,0.2962,0.3319,0.31749999999999995,0.2932,0.32920000000000005,0.19870000000000002,0.22280000000000003,0.20729999999999998,0.1943,0.24869999999999998,0.2336,0.21080000000000002,0.2167,0.14379999999999998,0.2455,0.3081999999999999,0.2632,0.18649999999999997,0.22899999999999998,0.1973,0.1865,0.2409,0.25049999999999994,0.2434,0.249,0.36849999999999994,0.3126,0.2546,0.2177,0.21309999999999998,0.2865,0.3136,0.34199999999999997,0.30810000000000004,0.33430000000000004,0.2139,0.1207,0.1624,0.17859999999999998,0.1837,0.15239999999999998,0.14180000000000004,0.1666,0.1557],"cost_curve":[0.005148,0.010185,0.014211,0.02347725,0.01758705,0.01690395,0.0183603,0.0190476,0.01985775,0.02159865,0.02291685,0.0233784,0.0236115,0.02514045,0.02692125,0.0270765,0.02729685,0.02732235,0.0276213,0.02805885,0.0283878,0.02874855,0.029283,0.0303792,0.0316437,0.03216195,0.0331521,0.03311355,0.03364425,0.03385185,0.03513165,0.0352575,0.0352437,0.03596565,0.03644205,0.03621585,0.03652755,0.03700005,0.03749835,0.0381651,0.0388725,0.0396954,0.04025565,0.04121115,0.04137255,0.04179855,0.04207725,0.04277145,0.04326195,0.04324665,0.04323045,0.0439005,0.04417335,0.0446307,0.04643175,0.04792185,0.04833735,0.0479955,0.0482853,0.0489558,0.05006445,0.05065575,0.05074845,0.05127705,0.05155485,0.0520014,0.05276295,0.0532953,0.0537834,0.05431275,0.0548196,0.0558915,0.0576108,0.0572301,0.05721585,0.0576348,0.05819265,0.0583122,0.0587994,0.05953215,0.06000555,0.06052545,0.06174735,0.0613242,0.0628761,0.0634095,0.0642036,0.0641001,0.06395445,0.0650556]},{"run_name":"icl-claude-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":1,"reward":36.330099999999995,"baseline_reward":19.7597,"reference_reward":90.0,"gain":16.570399999999996,"normalized_reward":0.23589601218696152,"normalized_gain":0.23591015414228006,"cost_usd":3.39641535,"latency_seconds":9.271725,"instance_count":90,"reward_curve":[0.2072,0.2245,0.2484,0.2674,0.2732,0.2627,0.2793,0.3361,0.3319,0.3755,0.4601,0.4866,0.4418,0.4305,0.4158,0.4217,0.4067,0.4075,0.4149,0.4658,0.4433,0.4786,0.4113,0.4311,0.3483,0.3591,0.4042,0.4009,0.3738,0.437,0.3867,0.4011,0.421,0.4644,0.467,0.5558,0.5398,0.4809,0.4993,0.475,0.5533,0.5368,0.5387,0.483,0.4536,0.4387,0.4463,0.4259,0.4039,0.4586,0.4102,0.3507,0.4059,0.3993,0.4603,0.3998,0.4336,0.3957,0.4066,0.3969,0.3716,0.4033,0.4662,0.4857,0.4378,0.4085,0.3712,0.3813,0.3839,0.338,0.322,0.3179,0.3652,0.4062,0.4376,0.4128,0.4162,0.3822,0.3944,0.3676,0.3617,0.3552,0.3223,0.2497,0.3977,0.3685,0.4464,0.4197,0.4,0.4053],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.0237,0.03670000000000001,0.041000000000000036,0.049100000000000005,0.0499,0.05199999999999999,0.1411,0.10979999999999998,0.1629,0.2197,0.2581,0.22250000000000003,0.1822,0.2238,0.22430000000000003,0.18280000000000002,0.18049999999999997,0.2084,0.21839999999999998,0.24150000000000002,0.27670000000000006,0.1983,0.22279999999999997,0.12390000000000001,0.12579999999999997,0.1948,0.19039999999999999,0.14260000000000003,0.2298,0.1885,0.19260000000000002,0.2115,0.2617,0.24350000000000002,0.3419,0.3369,0.2395,0.30200000000000005,0.2547,0.3269,0.34420000000000006,0.29899999999999993,0.26139999999999997,0.2263,0.2113,0.22479999999999997,0.195,0.17059999999999997,0.22990000000000002,0.1925,0.1292,0.1984,0.1866,0.2357,0.17459999999999998,0.23379999999999998,0.1596,0.2111,0.18129999999999996,0.12969999999999998,0.1919,0.24960000000000002,0.26470000000000005,0.23970000000000002,0.19299999999999998,0.14399999999999996,0.1261,0.1751,0.11680000000000001,0.06790000000000002,0.10440000000000002,0.11800000000000002,0.1759,0.2168,0.1751,0.17400000000000002,0.16929999999999998,0.14559999999999998,0.1679,0.15380000000000002,0.13760000000000003,0.10569999999999999,0.039599999999999996,0.1784,0.1681,0.24680000000000002,0.21800000000000003,0.15580000000000002,0.18309999999999998],"cost_curve":[0.007044,0.01185,0.01761,0.01514235,0.0150528,0.01487685,0.01517955,0.01650705,0.01723575,0.01815165,0.0194286,0.02057505,0.02051385,0.02045415,0.021411,0.02250195,0.02270925,0.02271765,0.0246045,0.02507865,0.02548125,0.02714625,0.027312,0.0267831,0.0273234,0.0278106,0.02820765,0.02854935,0.0286617,0.02951805,0.03017625,0.030669,0.03087585,0.03165945,0.03198645,0.0320547,0.03203175,0.03233265,0.03387375,0.03589185,0.0358074,0.03552585,0.03503715,0.0360636,0.0369432,0.0362136,0.03624285,0.0369129,0.0375144,0.039465,0.0400719,0.03951765,0.039621,0.04020735,0.04238625,0.04297335,0.04320075,0.04443585,0.04524285,0.0456279,0.0463848,0.04666095,0.04739895,0.0472503,0.04770225,0.04852605,0.0504489,0.05098395,0.05228625,0.05175255,0.05181615,0.05257035,0.05219175,0.0543387,0.05466885,0.0543891,0.05486325,0.05572845,0.05692365,0.0569253,0.05789505,0.05821305,0.05916195,0.0598245,0.06036255,0.0623742,0.0621672,0.0616089,0.0620265,0.06297045]},{"run_name":"icl-claude-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":2,"reward":37.13590000000001,"baseline_reward":19.7597,"reference_reward":90.0,"gain":17.376200000000008,"normalized_reward":0.24736827118837118,"normalized_gain":0.24738220081634058,"cost_usd":3.1266597,"latency_seconds":7.74246,"instance_count":90,"reward_curve":[0.2482,0.2809,0.3134,0.2932,0.3445,0.3268,0.3953,0.4128,0.4328,0.414,0.4561,0.4447,0.4587,0.4142,0.4169,0.431,0.3998,0.3861,0.3528,0.3507,0.335,0.3286,0.3311,0.3127,0.307,0.3423,0.3434,0.3415,0.3468,0.3801,0.3983,0.4145,0.4231,0.4039,0.3351,0.3704,0.4314,0.4537,0.4336,0.4462,0.4159,0.3878,0.4646,0.4528,0.4696,0.4787,0.537,0.5404,0.5344,0.5291,0.4614,0.5105,0.508,0.4734,0.4058,0.4161,0.4798,0.4587,0.5006,0.4279,0.4227,0.3766,0.3446,0.3939,0.3721,0.3354,0.3505,0.3481,0.4048,0.4441,0.3885,0.3797,0.363,0.3622,0.347,0.4013,0.4244,0.4609,0.4237,0.434,0.4385,0.5065,0.5289,0.5595,0.5553,0.4619,0.4414,0.4502,0.4547,0.4574],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.03269999999999998,0.10170000000000001,0.06680000000000003,0.12039999999999998,0.11399999999999999,0.16799999999999998,0.2178,0.21070000000000003,0.20139999999999997,0.2157,0.21619999999999998,0.2394,0.16590000000000002,0.2249,0.2336,0.1759,0.1591,0.1463,0.1033,0.1332,0.1267,0.11810000000000001,0.10439999999999997,0.0826,0.10899999999999999,0.13399999999999998,0.13100000000000003,0.11560000000000001,0.1729,0.2001,0.206,0.21359999999999998,0.2012,0.1116,0.1565,0.2285,0.2123,0.23629999999999998,0.2259,0.1895,0.19519999999999998,0.22490000000000002,0.2312,0.24230000000000002,0.2513,0.3155,0.3095,0.3011,0.3004,0.24369999999999997,0.2889999999999999,0.3005,0.2607,0.1812,0.19090000000000001,0.28,0.2226,0.30510000000000004,0.2123,0.18080000000000002,0.16519999999999999,0.12800000000000003,0.17289999999999997,0.174,0.11989999999999998,0.12329999999999997,0.09290000000000004,0.19599999999999998,0.2229,0.13440000000000002,0.1662,0.11579999999999999,0.13190000000000002,0.12619999999999998,0.1636,0.1822,0.24799999999999997,0.17490000000000003,0.2343,0.2306,0.28889999999999993,0.3123,0.3494,0.336,0.26149999999999995,0.24180000000000001,0.2485,0.2105,0.23519999999999996],"cost_curve":[0.005562,0.00906,0.013413,0.02029275,0.0158502,0.0165192,0.0170214,0.0190602,0.01983525,0.0194409,0.01975425,0.01972605,0.0199071,0.02049225,0.0222129,0.02307885,0.0233622,0.0225987,0.02299215,0.024384,0.0242172,0.0244386,0.02468175,0.02482875,0.02525565,0.0263655,0.02643645,0.02597325,0.02679105,0.0273669,0.0278424,0.0278487,0.0283566,0.03068295,0.0307404,0.02981985,0.0306756,0.03159795,0.03305235,0.03362535,0.0336729,0.0331386,0.0336234,0.03408015,0.0342774,0.03490155,0.03515745,0.0354798,0.03597375,0.03592365,0.0367104,0.0374016,0.0389019,0.0392934,0.0387759,0.0391428,0.04052385,0.0404169,0.04121445,0.0413046,0.04134885,0.0425664,0.04402125,0.0443559,0.0433887,0.04335,0.04424115,0.04521225,0.04520805,0.0455289,0.0470802,0.04725165,0.0469017,0.04731135,0.04812075,0.04863705,0.04864275,0.04908135,0.05043495,0.0511143,0.05093445,0.0507858,0.0510963,0.0518484,0.05207895,0.05260605,0.0530037,0.05361405,0.05559345,0.056226]},{"run_name":"icl-claude-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":3,"reward":32.108399999999996,"baseline_reward":19.7597,"reference_reward":90.0,"gain":12.348699999999997,"normalized_reward":0.17579122709605763,"normalized_gain":0.1758064814643445,"cost_usd":3.62089155,"latency_seconds":10.365783,"instance_count":90,"reward_curve":[0.192,0.2393,0.2287,0.2413,0.2439,0.2404,0.2894,0.3456,0.3396,0.3373,0.3166,0.3171,0.3143,0.3246,0.3238,0.384,0.3756,0.3859,0.3693,0.3762,0.3651,0.3787,0.3606,0.3922,0.3506,0.3439,0.4136,0.3629,0.4307,0.4592,0.4826,0.4861,0.4558,0.5316,0.4934,0.4695,0.4594,0.4413,0.4382,0.466,0.4611,0.4426,0.4178,0.4211,0.3829,0.3756,0.326,0.3103,0.2859,0.2689,0.267,0.3203,0.3363,0.3628,0.3776,0.3952,0.3539,0.3355,0.3514,0.2886,0.3189,0.318,0.3432,0.3353,0.324,0.2999,0.275,0.3234,0.3626,0.3188,0.3061,0.3055,0.3081,0.2559,0.2253,0.3031,0.3969,0.3782,0.3442,0.3662,0.3554,0.3157,0.3021,0.3345,0.3784,0.451,0.4383,0.429,0.4225,0.4258],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.008899999999999991,0.016999999999999987,0.014899999999999997,0.019800000000000012,0.027600000000000013,0.06209999999999999,0.1506,0.11750000000000002,0.12469999999999998,0.07619999999999999,0.08859999999999998,0.09500000000000003,0.0763,0.13179999999999997,0.18660000000000002,0.1517,0.1589,0.16280000000000003,0.12879999999999997,0.16329999999999997,0.17679999999999998,0.14759999999999998,0.18389999999999998,0.12620000000000003,0.11059999999999998,0.20420000000000002,0.1524,0.19950000000000004,0.252,0.2844,0.27759999999999996,0.2463,0.32889999999999997,0.26990000000000003,0.25559999999999994,0.25649999999999995,0.19990000000000002,0.24089999999999998,0.24570000000000003,0.23470000000000002,0.25,0.1781,0.19949999999999998,0.15560000000000002,0.1482,0.10450000000000001,0.07940000000000003,0.05259999999999998,0.040199999999999986,0.04930000000000001,0.09879999999999997,0.1288,0.1501,0.153,0.16999999999999998,0.1541,0.09940000000000002,0.15589999999999998,0.07300000000000001,0.07700000000000001,0.1066,0.12660000000000002,0.11429999999999998,0.1259,0.0844,0.04780000000000001,0.06820000000000004,0.15379999999999996,0.09759999999999996,0.05199999999999999,0.092,0.06089999999999998,0.02560000000000001,0.004500000000000004,0.06539999999999999,0.15469999999999998,0.16529999999999997,0.09540000000000001,0.16650000000000004,0.1475,0.09809999999999999,0.08549999999999999,0.12440000000000001,0.15910000000000002,0.25060000000000004,0.23870000000000002,0.2273,0.1783,0.2036],"cost_curve":[0.00627,0.010956,0.0165135,0.01350285,0.0137013,0.0140727,0.01475805,0.01596225,0.01618215,0.0167805,0.01677495,0.01707285,0.0189828,0.0194223,0.0199821,0.02064615,0.020607,0.02168175,0.0241599,0.0244002,0.02406585,0.0254466,0.0258303,0.0262347,0.0268683,0.02675265,0.02744235,0.0294702,0.0300303,0.030624,0.03119445,0.03264165,0.0331194,0.0334986,0.0351546,0.03549345,0.0347814,0.0353004,0.03655455,0.0364641,0.03713025,0.03869175,0.0407283,0.04127865,0.04135785,0.0419268,0.04201305,0.0423084,0.04279695,0.04309755,0.04414185,0.04466115,0.0456306,0.0469698,0.04750785,0.0473541,0.0480543,0.04900125,0.04939545,0.04910445,0.0498744,0.0506037,0.0511647,0.05170365,0.05240535,0.0533526,0.05387325,0.0549021,0.0550089,0.0559086,0.05606865,0.0565947,0.05786295,0.0585483,0.05846025,0.05844435,0.06028965,0.06100575,0.0611322,0.06247095,0.0633654,0.06452625,0.0649599,0.06515385,0.06508815,0.06624765,0.06592035,0.066327,0.067374,0.0697065]},{"run_name":"icl-claude-sonnet-4.6","task":"blind_spectrum_monitoring","run_index":4,"reward":39.84159999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":20.081899999999994,"normalized_reward":0.28588960548982745,"normalized_gain":0.2859028221690396,"cost_usd":4.12834035,"latency_seconds":11.99642,"instance_count":90,"reward_curve":[0.2273,0.2631,0.3401,0.3607,0.3013,0.3666,0.346,0.3662,0.3333,0.3894,0.3527,0.3801,0.4332,0.4332,0.3837,0.3652,0.3652,0.3652,0.3705,0.3571,0.3571,0.3571,0.3571,0.3646,0.3646,0.3646,0.4502,0.4502,0.4502,0.4502,0.4502,0.4502,0.4754,0.4877,0.4877,0.5025,0.5025,0.4969,0.4969,0.4969,0.4969,0.4869,0.4588,0.4588,0.4588,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793,0.4793],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,0.014899999999999997,0.12840000000000001,0.13430000000000003,0.07720000000000002,0.1538,0.11869999999999997,0.17120000000000002,0.1112,0.1768,0.11230000000000001,0.15159999999999998,0.21389999999999998,0.18489999999999998,0.19169999999999998,0.16780000000000003,0.14130000000000004,0.13820000000000002,0.164,0.10969999999999996,0.15529999999999997,0.15519999999999998,0.14409999999999998,0.15629999999999997,0.1402,0.13129999999999997,0.2408,0.2397,0.219,0.243,0.252,0.2417,0.2659,0.28500000000000003,0.2642,0.28859999999999997,0.2996,0.2555,0.2996,0.2766,0.2705,0.2943,0.2191,0.2372,0.23149999999999998,0.2519,0.25780000000000003,0.2484,0.246,0.25060000000000004,0.2616,0.25780000000000003,0.27180000000000004,0.2666,0.25470000000000004,0.2541,0.27949999999999997,0.2432,0.2838,0.2637,0.2374,0.2679,0.26270000000000004,0.2583,0.2812,0.26380000000000003,0.2521,0.22410000000000002,0.27049999999999996,0.2581,0.2252,0.26580000000000004,0.2321,0.249,0.2585,0.2416,0.2371,0.26639999999999997,0.2305,0.2796,0.2714,0.26170000000000004,0.26270000000000004,0.2692,0.26,0.27890000000000004,0.2797,0.2776,0.2351,0.2571],"cost_curve":[0.006978,0.010884,0.0182145,0.01867995,0.02129565,0.0221721,0.02406915,0.024555,0.0256899,0.02791395,0.02852805,0.0285624,0.0292971,0.0294429,0.0298785,0.03121815,0.03166845,0.0317259,0.03248475,0.0332055,0.03357615,0.03383325,0.03421845,0.03284775,0.03333105,0.0340791,0.03455895,0.03509145,0.03554415,0.0364779,0.03705255,0.0373005,0.03890145,0.0396996,0.03975435,0.0399426,0.04037715,0.0415545,0.04234665,0.04260585,0.0429648,0.0446844,0.04507365,0.0447345,0.04549395,0.04694355,0.04772175,0.0478827,0.04837755,0.04895115,0.04907685,0.04991085,0.05054325,0.0506373,0.05156895,0.0521349,0.05303715,0.05390025,0.05383455,0.05393985,0.05459775,0.05558295,0.055953,0.0566853,0.05751555,0.0577932,0.0587304,0.05919735,0.0592785,0.06005295,0.060216,0.0606699,0.06200925,0.0626616,0.06297915,0.06354195,0.06389745,0.0648645,0.0658875,0.06610665,0.06623115,0.0667569,0.06729465,0.067875,0.06843105,0.06897315,0.0693579,0.07040595,0.0718863,0.0719382]},{"run_name":"icl-claude-sonnet-4.6","task":"codebase_adaptation","run_index":0,"reward":9.25,"baseline_reward":7.050000000000001,"reference_reward":19.0,"gain":2.1999999999999993,"normalized_reward":-0.020942408376963276,"normalized_gain":0.1841004184100418,"cost_usd":4.295421,"latency_seconds":4.810173,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.0,0.9,0.0,0.925,0.0,0.0,0.65,0.925,0.325,0.85,0.925,0.55,0.775,0.775,0.775,0.875],"baseline_reward_curve":[0.0,0.0,0.0,0.525,0.825,0.075,0.825,0.425,0.0,0.5,0.7,0.25,0.9,0.825,0.0,0.45,0.0,0.0,0.75],"gain_curve":[0.0,0.0,0.0,-0.525,0.07500000000000007,-0.075,0.10000000000000009,-0.425,0.0,0.15000000000000002,0.2250000000000001,0.07500000000000001,-0.050000000000000044,0.10000000000000009,0.55,0.325,0.775,0.775,0.125],"cost_curve":[0.07235505,0.0794547,0.05743185,0.06056565,0.0519318,0.11156955,0.07240245,0.17726745,0.08302305,0.356694,0.0921441,0.71793855,0.18077025,0.13674765,0.65496495,0.38950725,0.35612085,0.4029426,0.24158925]},{"run_name":"icl-claude-sonnet-4.6","task":"codebase_adaptation","run_index":1,"reward":10.425,"baseline_reward":7.050000000000001,"reference_reward":19.0,"gain":3.375,"normalized_reward":0.10209424083769647,"normalized_gain":0.28242677824267787,"cost_usd":7.04175165,"latency_seconds":5.379357,"instance_count":19,"reward_curve":[0.85,0.7,0.675,0.0,0.8,0.0,0.625,0.9,0.0,0.5,0.95,0.225,0.25,0.8,0.975,0.9,0.4,0.875,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.525,0.825,0.075,0.825,0.425,0.0,0.5,0.7,0.25,0.9,0.825,0.0,0.45,0.0,0.0,0.75],"gain_curve":[0.85,0.7,0.675,-0.525,-0.02499999999999991,-0.075,-0.19999999999999996,0.47500000000000003,0.0,0.0,0.25,-0.024999999999999994,-0.65,-0.02499999999999991,0.975,0.45,0.4,0.875,-0.75],"cost_curve":[0.0959145,0.10973205,0.21977355,0.15103245,0.1309947,0.1680939,0.4440357,0.11526705,0.1518513,0.5726154,0.0924981,1.12686045,1.18093755,0.4117047,0.08870445,0.2332578,1.1888337,0.3036813,0.255963]},{"run_name":"icl-claude-sonnet-4.6","task":"codebase_adaptation","run_index":2,"reward":9.699999999999998,"baseline_reward":7.050000000000001,"reference_reward":19.0,"gain":2.649999999999997,"normalized_reward":0.026178010471204,"normalized_gain":0.22175732217573196,"cost_usd":6.88130145,"latency_seconds":4.941337,"instance_count":19,"reward_curve":[0.725,0.3,0.0,0.0,0.725,0.825,0.6,0.0,0.9,0.475,0.375,0.725,0.825,0.0,0.85,0.0,0.825,0.6,0.95],"baseline_reward_curve":[0.0,0.0,0.0,0.525,0.825,0.075,0.825,0.425,0.0,0.5,0.7,0.25,0.9,0.825,0.0,0.45,0.0,0.0,0.75],"gain_curve":[0.725,0.3,0.0,-0.525,-0.09999999999999998,0.75,-0.22499999999999998,-0.425,0.9,-0.025000000000000022,-0.32499999999999996,0.475,-0.07500000000000007,-0.825,0.85,-0.45,0.825,0.6,0.19999999999999996],"cost_curve":[0.0825753,0.388479,0.24290985,0.1283088,0.2383053,0.15549825,0.3825261,0.3645216,0.14594775,0.6974496,0.9051795,0.48291975,0.30097995,0.4962018,0.3046542,0.2171385,0.3595719,0.83384355,0.15429075]},{"run_name":"icl-claude-sonnet-4.6","task":"codebase_adaptation","run_index":3,"reward":9.325,"baseline_reward":7.050000000000001,"reference_reward":19.0,"gain":2.2749999999999986,"normalized_reward":-0.013089005235602092,"normalized_gain":0.1903765690376568,"cost_usd":7.5331347,"latency_seconds":5.483116,"instance_count":19,"reward_curve":[0.2,0.725,0.0,0.825,0.0,0.0,0.75,0.85,0.825,0.775,0.875,0.925,0.35,0.0,0.0,0.65,0.7,0.0,0.875],"baseline_reward_curve":[0.0,0.0,0.0,0.525,0.825,0.075,0.825,0.425,0.0,0.5,0.7,0.25,0.9,0.825,0.0,0.45,0.0,0.0,0.75],"gain_curve":[0.2,0.725,0.0,0.29999999999999993,-0.825,-0.075,-0.07499999999999996,0.425,0.825,0.275,0.17500000000000004,0.675,-0.55,-0.825,0.0,0.2,0.7,0.0,0.125],"cost_curve":[0.79928655,0.22908825,0.3906672,0.22179255,0.278121,0.2100426,0.2978049,0.2375481,0.2524374,0.34563975,0.2232645,0.1444194,1.0662,0.311706,0.56742525,0.715269,0.6721302,0.2561946,0.31409745]},{"run_name":"icl-claude-sonnet-4.6","task":"codebase_adaptation","run_index":4,"reward":10.074999999999998,"baseline_reward":7.050000000000001,"reference_reward":19.0,"gain":3.024999999999997,"normalized_reward":0.06544502617801028,"normalized_gain":0.2531380753138073,"cost_usd":8.84900625,"latency_seconds":5.402964,"instance_count":19,"reward_curve":[0.0,0.425,0.0,0.775,0.725,0.825,0.0,0.575,0.0,0.925,0.85,0.85,0.0,0.55,0.675,0.8,0.5,0.75,0.85],"baseline_reward_curve":[0.0,0.0,0.0,0.525,0.825,0.075,0.825,0.425,0.0,0.5,0.7,0.25,0.9,0.825,0.0,0.45,0.0,0.0,0.75],"gain_curve":[0.0,0.425,0.0,0.25,-0.09999999999999998,0.75,-0.825,0.14999999999999997,0.0,0.42500000000000004,0.15000000000000002,0.6,-0.9,-0.2749999999999999,0.675,0.35000000000000003,0.5,0.75,0.09999999999999998],"cost_curve":[0.0611586,0.24956145,0.1728048,0.18617205,0.27102735,0.1832421,0.0949893,0.41972085,0.314328,0.1137813,0.23544465,0.21550815,1.6542867,0.8292114,0.68252805,0.43346475,1.1371194,1.1939067,0.40075065]},{"run_name":"icl-claude-sonnet-4.6","task":"cohort_studies","run_index":0,"reward":0.4512999999999999,"baseline_reward":0.5761000000000001,"reference_reward":3.24404,"gain":-0.12480000000000013,"normalized_reward":-0.2414715897472372,"normalized_gain":-0.04677766366559973,"cost_usd":5.69614695,"latency_seconds":13.035919,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0721,0.2084,0.0344,0.0,0.1364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0005,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.2448,0.0,0.0,0.1811,0.0897,0.0,0.0,0.0],"gain_curve":[0.0,-0.0005,0.0,0.0,0.0,0.0721,0.1484,0.0344,0.0,0.1364,0.0,0.0,-0.2448,0.0,0.0,-0.1811,-0.0897,0.0,0.0,0.0],"cost_curve":[0.47210865,0.2380191,0.1678956,0.17629275,0.37599345,0.42942765,0.253281,0.2033019,0.5082162,0.21688905,0.2165355,0.21999345,0.3788202,0.23314815,0.2338869,0.23872755,0.3649812,0.25200315,0.2560503,0.2605752]},{"run_name":"icl-claude-sonnet-4.6","task":"cohort_studies","run_index":1,"reward":0.0104,"baseline_reward":0.5761000000000001,"reference_reward":3.24404,"gain":-0.5657000000000001,"normalized_reward":-0.4374672155196173,"normalized_gain":-0.2120362526893409,"cost_usd":5.3679132,"latency_seconds":13.44504,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0104],"baseline_reward_curve":[0.0,0.0005,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.2448,0.0,0.0,0.1811,0.0897,0.0,0.0,0.0],"gain_curve":[0.0,-0.0005,0.0,0.0,0.0,0.0,-0.06,0.0,0.0,0.0,0.0,0.0,-0.2448,0.0,0.0,-0.1811,-0.0897,0.0,0.0,0.0104],"cost_curve":[0.3919263,0.20817135,0.2224203,0.21329265,0.37865145,0.21659115,0.2233023,0.2330478,0.3276108,0.2036217,0.2065251,0.2121744,0.3689958,0.22352505,0.2273109,0.23095005,0.34908675,0.30274575,0.30891315,0.31905045]},{"run_name":"icl-claude-sonnet-4.6","task":"cohort_studies","run_index":2,"reward":1.241,"baseline_reward":0.5761000000000001,"reference_reward":3.24404,"gain":0.6649,"normalized_reward":0.10957795816033507,"normalized_gain":0.24921849816712524,"cost_usd":7.0853052,"latency_seconds":22.756767,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.2079,0.0,0.0,0.0,0.1851,0.0,0.0211,0.0,0.0,0.0,0.0,0.0,0.0,0.1798,0.3386,0.3085],"baseline_reward_curve":[0.0,0.0005,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.2448,0.0,0.0,0.1811,0.0897,0.0,0.0,0.0],"gain_curve":[0.0,-0.0005,0.0,0.0,0.2079,0.0,-0.06,0.0,0.1851,0.0,0.0211,0.0,-0.2448,0.0,0.0,-0.1811,-0.0897,0.1798,0.3386,0.3085],"cost_curve":[0.66730755,0.2657949,0.2474961,0.25042575,0.5234667,0.30343665,0.3150435,0.32127105,0.44789325,0.26996385,0.2662854,0.2791398,0.49733145,0.29486685,0.2936529,0.3072954,0.5459244,0.3218766,0.32969535,0.33713775]},{"run_name":"icl-claude-sonnet-4.6","task":"cohort_studies","run_index":3,"reward":0.8401,"baseline_reward":0.5761000000000001,"reference_reward":3.24404,"gain":0.2639999999999999,"normalized_reward":-0.06863625452314695,"normalized_gain":0.09895275006184544,"cost_usd":5.19772275,"latency_seconds":13.929714,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2926,0.197,0.2524,0.0268],"baseline_reward_curve":[0.0,0.0005,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.2448,0.0,0.0,0.1811,0.0897,0.0,0.0,0.0],"gain_curve":[0.0,-0.0005,0.0,0.0,0.0,0.0,-0.06,0.0,0.0713,0.0,0.0,0.0,-0.2448,0.0,0.0,-0.1811,0.20290000000000002,0.197,0.2524,0.0268],"cost_curve":[0.40434615,0.2263824,0.16819185,0.1742295,0.32199165,0.1918548,0.1974924,0.20168415,0.440919,0.2100549,0.2113605,0.21606195,0.38636265,0.22638555,0.23095215,0.23571045,0.3607689,0.2573958,0.26428035,0.27129765]},{"run_name":"icl-claude-sonnet-4.6","task":"cohort_studies","run_index":4,"reward":1.2649000000000001,"baseline_reward":0.5761000000000001,"reference_reward":3.24404,"gain":0.6888000000000001,"normalized_reward":0.12020235248095175,"normalized_gain":0.25817672061590596,"cost_usd":4.7401437,"latency_seconds":16.453593,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0782,0.2845,0.2513,0.2384,0.019,0.0522,0.1984,0.0,0.0,0.0,0.0,0.047,0.0177,0.0782,0.0],"baseline_reward_curve":[0.0,0.0005,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.2448,0.0,0.0,0.1811,0.0897,0.0,0.0,0.0],"gain_curve":[0.0,-0.0005,0.0,0.0,0.0,0.0782,0.22449999999999998,0.2513,0.2384,0.019,0.0522,0.1984,-0.2448,0.0,0.0,-0.1811,-0.0427,0.0177,0.0782,0.0],"cost_curve":[0.3738021,0.18615225,0.1524024,0.1583607,0.2892138,0.1794381,0.18519435,0.19070625,0.30138585,0.2098962,0.21620145,0.2215263,0.33659535,0.2275197,0.23133705,0.2389527,0.290655,0.2443866,0.25017435,0.2562432]},{"run_name":"icl-claude-sonnet-4.6","task":"database_exploration","run_index":0,"reward":14.666666666666668,"baseline_reward":6.533333333333332,"reference_reward":40.0,"gain":8.133333333333336,"normalized_reward":0.26499032882011614,"normalized_gain":0.24302788844621523,"cost_usd":2.1787596,"latency_seconds":2.122906,"instance_count":40,"reward_curve":[0.0,0.0,0.6,0.0,0.9333333333333333,0.8,0.6666666666666667,0.0,0.6666666666666667,0.8666666666666667,0.7333333333333334,0.8666666666666667,0.0,0.8,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.6,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.8666666666666667,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.5333333333333333,0.0,0.4666666666666667,0.5333333333333333,0.33333333333333337,0.6666666666666667,0.0,0.6,0.6666666666666667,0.0,0.19999999999999996,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.2666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0],"gain_curve":[0.0,0.0,0.6,-0.5333333333333333,0.9333333333333333,0.33333333333333337,0.13333333333333341,-0.33333333333333337,0.0,0.8666666666666667,0.13333333333333341,0.19999999999999996,0.0,0.6000000000000001,0.0,0.9333333333333333,0.0,0.0,0.33333333333333337,0.6,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.8666666666666667,-0.5333333333333333,0.33333333333333337,0.0,0.4,0.0],"cost_curve":[0.0487353,0.03210525,0.05486565,0.0361704,0.01477215,0.0338364,0.05225265,0.11798055,0.07244775,0.03464205,0.0583551,0.037854,0.0253467,0.0561192,0.10015575,0.0289866,0.02870625,0.02827215,0.02879865,0.1058613,0.03242565,0.04821555,0.03303615,0.0866301,0.0334725,0.06981345,0.0354822,0.0348492,0.2366715,0.04054095,0.0619902,0.04096965,0.06176805,0.04229565,0.0673923,0.0698064,0.0459357,0.04515675,0.0482934,0.0477504]},{"run_name":"icl-claude-sonnet-4.6","task":"database_exploration","run_index":1,"reward":16.800000000000004,"baseline_reward":6.533333333333332,"reference_reward":40.0,"gain":10.266666666666673,"normalized_reward":0.3268858800773696,"normalized_gain":0.30677290836653404,"cost_usd":1.46627775,"latency_seconds":1.874232,"instance_count":40,"reward_curve":[0.5333333333333333,0.8666666666666667,0.9333333333333333,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.8,0.8666666666666667,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.0,0.8666666666666667,0.0,0.8666666666666667,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.5333333333333333,0.0,0.4666666666666667,0.5333333333333333,0.33333333333333337,0.6666666666666667,0.0,0.6,0.6666666666666667,0.0,0.19999999999999996,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.2666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0],"gain_curve":[0.5333333333333333,0.8666666666666667,0.9333333333333333,-0.5333333333333333,0.0,-0.4666666666666667,-0.5333333333333333,0.5333333333333333,-0.6666666666666667,0.0,-0.6,-0.6666666666666667,0.0,0.6000000000000001,0.8666666666666667,0.0,0.9333333333333333,0.0,-0.6,0.0,0.0,0.8666666666666667,0.0,-0.2666666666666667,0.8666666666666667,0.0,0.8666666666666667,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.4,-0.6,0.9333333333333333,0.4,0.9333333333333333],"cost_curve":[0.0502014,0.0185724,0.01155,0.03178905,0.01324785,0.0443202,0.04445115,0.02643105,0.06200175,0.0210108,0.01986225,0.01942005,0.0326073,0.0452772,0.0340137,0.0222738,0.02420865,0.02359965,0.02343675,0.0245646,0.10669755,0.04204875,0.02974845,0.04836015,0.0443274,0.0297975,0.04570665,0.094599,0.03379545,0.03637245,0.03527175,0.0344199,0.03555915,0.03535065,0.0368763,0.03657555,0.03647955,0.03598875,0.0378285,0.0376347]},{"run_name":"icl-claude-sonnet-4.6","task":"database_exploration","run_index":2,"reward":17.333333333333336,"baseline_reward":6.533333333333332,"reference_reward":40.0,"gain":10.800000000000004,"normalized_reward":0.3423597678916829,"normalized_gain":0.32270916334661365,"cost_usd":2.20983525,"latency_seconds":2.156902,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.8,0.8,0.8,0.9333333333333333,0.0,0.06666666666666665,0.0,0.9333333333333333,0.8666666666666667,0.0,0.0,0.8,0.8,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.5333333333333333,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.5333333333333333,0.0,0.4666666666666667,0.5333333333333333,0.33333333333333337,0.6666666666666667,0.0,0.6,0.6666666666666667,0.0,0.19999999999999996,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.2666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0],"gain_curve":[0.0,0.0,0.0,0.2666666666666667,0.8,0.33333333333333337,0.4,-0.33333333333333337,-0.6000000000000001,0.0,0.33333333333333337,0.19999999999999996,0.0,-0.19999999999999996,0.8,0.8,0.9333333333333333,0.9333333333333333,0.33333333333333337,0.5333333333333333,0.0,0.0,0.7333333333333334,-0.2666666666666667,0.0,0.0,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.0,0.4,-0.6,0.0,-0.5333333333333333,0.9333333333333333],"cost_curve":[0.09679365,0.0745941,0.0156924,0.0340539,0.03940755,0.03869145,0.01841715,0.03967065,0.16894965,0.02392095,0.02549295,0.0389142,0.05133855,0.0741837,0.0562527,0.05964615,0.02903385,0.0292539,0.03091365,0.1276017,0.06811815,0.0512694,0.08957745,0.03721635,0.05522895,0.0364878,0.0589347,0.04020615,0.0396009,0.0413337,0.121557,0.0413694,0.0634011,0.04218555,0.0423744,0.0427527,0.0427203,0.13234275,0.0455547,0.044781]},{"run_name":"icl-claude-sonnet-4.6","task":"database_exploration","run_index":3,"reward":9.600000000000001,"baseline_reward":6.533333333333332,"reference_reward":40.0,"gain":3.066666666666669,"normalized_reward":0.11798839458413933,"normalized_gain":0.09163346613545824,"cost_usd":2.22188085,"latency_seconds":2.039522,"instance_count":40,"reward_curve":[0.0,0.7333333333333334,0.0,0.0,0.9333333333333333,0.0,0.8666666666666667,0.0,0.0,0.0,0.7333333333333334,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.5333333333333333,0.0,0.4666666666666667,0.5333333333333333,0.33333333333333337,0.6666666666666667,0.0,0.6,0.6666666666666667,0.0,0.19999999999999996,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.2666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0],"gain_curve":[0.0,0.7333333333333334,0.0,-0.5333333333333333,0.9333333333333333,-0.4666666666666667,0.33333333333333337,-0.33333333333333337,-0.6666666666666667,0.0,0.13333333333333341,0.1333333333333333,0.0,-0.19999999999999996,0.0,0.0,0.0,0.0,-0.6,0.0,0.0,0.0,0.8666666666666667,-0.2666666666666667,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,-0.5333333333333333,0.33333333333333337,0.9333333333333333,0.4,0.0],"cost_curve":[0.06979035,0.0322524,0.12268815,0.10961955,0.019254,0.0219129,0.03135765,0.02335995,0.0447816,0.0602472,0.0632844,0.05315715,0.0948357,0.0432537,0.06040575,0.06116535,0.0304665,0.06392145,0.03226005,0.03359355,0.08389065,0.06981945,0.0532266,0.0356277,0.03538755,0.05818995,0.15493575,0.0583065,0.08957775,0.0449082,0.0418563,0.04306635,0.04394145,0.04263255,0.04393785,0.065784,0.04743495,0.04507695,0.046545,0.046128]},{"run_name":"icl-claude-sonnet-4.6","task":"database_exploration","run_index":4,"reward":16.66666666666667,"baseline_reward":6.533333333333332,"reference_reward":40.0,"gain":10.13333333333334,"normalized_reward":0.32301740812379126,"normalized_gain":0.3027888446215141,"cost_usd":1.6706001,"latency_seconds":1.933041,"instance_count":40,"reward_curve":[0.6666666666666667,0.6666666666666667,0.0,0.8666666666666667,0.9333333333333333,0.8666666666666667,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.8666666666666667,0.0,0.0,0.9333333333333333,0.8666666666666667,0.0,0.7333333333333334,0.0,0.0,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.5333333333333333,0.0,0.4666666666666667,0.5333333333333333,0.33333333333333337,0.6666666666666667,0.0,0.6,0.6666666666666667,0.0,0.19999999999999996,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.2666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0],"gain_curve":[0.6666666666666667,0.6666666666666667,0.0,0.33333333333333337,0.9333333333333333,0.4,-0.5333333333333333,0.6,0.2666666666666666,0.9333333333333333,0.33333333333333337,-0.6666666666666667,0.0,0.7333333333333334,0.0,0.8666666666666667,0.0,0.0,0.33333333333333337,0.8666666666666667,0.0,0.7333333333333334,0.0,-0.2666666666666667,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.4,-0.6,0.9333333333333333,-0.5333333333333333,0.0],"cost_curve":[0.03830175,0.035394,0.038043,0.0193446,0.0125472,0.019551,0.1357638,0.0178365,0.0201819,0.01886445,0.02004735,0.03234015,0.06841275,0.0213021,0.0745278,0.0370548,0.05173365,0.02526765,0.02669415,0.0432621,0.0579837,0.0743946,0.02962305,0.0295743,0.03195945,0.04544325,0.0310374,0.10017975,0.06882,0.0335772,0.03364575,0.03473655,0.0373182,0.0384066,0.03564075,0.03780735,0.05961495,0.03788025,0.05720115,0.03928515]},{"run_name":"icl-claude-sonnet-4.6","task":"exploitable_poker","run_index":0,"reward":379.0,"baseline_reward":316.7,"reference_reward":1138.5,"gain":62.30000000000001,"normalized_reward":0.24420340332371382,"normalized_gain":0.07580919931856901,"cost_usd":8.7725622,"latency_seconds":8.086811,"instance_count":120,"reward_curve":[-1.0,-0.5,-2.5,7.0,-0.5,2.5,6.5,-3.0,38.0,-7.0,15.5,-33.5,2.5,7.0,-6.5,-0.5,6.0,-1.0,13.5,37.0,-1.0,-1.0,-0.5,-0.5,1.0,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,1.0,7.0,0.5,0.5,6.0,4.0,1.0,80.0,-1.0,-1.0,-0.5,3.0,1.0,-8.0,10.0,-0.5,-0.5,4.0,34.5,2.5,-7.0,-7.0,6.0,15.5,13.5,-5.5,-3.0,-2.5,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,3.0,1.0,1.0,7.0,14.5,0.5,1.0,-7.0,3.0,-0.5,0.5,-0.5,1.0,1.0,0.5,35.5,3.0,3.0,1.0,1.0,15.5,1.0,1.0,0.5,1.0,1.0,-7.0,0.5,0.5,3.0,5.0,4.0,3.0,0.5,5.0,1.0,1.0,1.0,-6.0,0.5,-49.0,-1.0,1.0,-1.0,1.0,5.0,11.0,6.0,-6.0,82.0,3.0,0.5,-6.0],"baseline_reward_curve":[-2.5,5.4,-2.0,7.0,-2.4,2.4,4.0,-3.0,44.0,-15.0,5.2,-12.2,4.5,13.5,-7.0,-12.9,5.4,-5.9,11.4,31.0,2.0,-6.0,-0.5,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-1.0,1.0,-2.0,-3.0,10.0,-0.5,-0.5,-1.0,7.5,2.4,-8.0,-9.0,5.7,15.5,4.5,-4.0,-1.0,-2.4,-0.5,0.5,0.5,-2.2,-0.5,1.0,1.0,-0.5,1.0,-3.5,2.4,14.5,0.5,-4.0,-2.4,4.0,-0.5,0.5,-0.5,1.0,1.0,0.5,11.2,1.0,3.0,-5.0,1.0,4.0,1.0,-2.0,0.5,-2.4,-0.5,-2.5,0.5,0.5,1.0,2.0,4.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,2.0,0.5,-11.0,2.0,1.0,2.0,-1.0,5.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[1.5,-5.9,-0.5,0.0,1.9,0.10000000000000009,2.5,0.0,-6.0,8.0,10.3,-21.3,-2.0,-6.5,0.5,12.4,0.5999999999999996,4.9,2.0999999999999996,6.0,-3.0,5.0,0.0,-1.5,0.0,0.0,0.0,0.5,0.0,0.5,-2.0,0.0,2.0,1.0,0.0,0.0,8.0,4.0,0.0,-14.0,0.0,1.0,0.5,2.0,3.0,-5.0,0.0,0.0,0.0,5.0,27.0,0.10000000000000009,1.0,2.0,0.2999999999999998,0.0,9.0,-1.5,-2.0,-0.10000000000000009,0.0,0.0,0.0,1.7000000000000002,0.0,0.0,-1.5,3.5,0.0,4.5,4.6,0.0,0.0,5.0,-4.6,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,24.3,2.0,0.0,6.0,0.0,11.5,0.0,3.0,0.0,3.4,1.5,-4.5,0.0,0.0,2.0,3.0,0.0,0.0,0.0,6.0,2.0,0.0,2.0,-8.0,0.0,-38.0,-3.0,0.0,-3.0,2.0,0.0,8.0,7.0,-8.0,-18.0,-0.5,0.0,-2.0],"cost_curve":[0.026478,0.010779,0.04018125,0.04163475,0.011637,0.03955335,0.0445083,0.04482855,0.0481719,0.0588711,0.05869125,0.0654261,0.06537075,0.0662826,0.0705777,0.01769685,0.070722,0.0814275,0.0792162,0.08037435,0.0202287,0.01996515,0.02066025,0.02129055,0.04307325,0.02053815,0.0,0.0213276,0.0,0.0223254,0.11746035,0.0,0.02393355,0.1206831,0.0,0.0,0.0982284,0.07507335,0.02549835,0.1501083,0.07824075,0.02671515,0.0271533,0.0539088,0.0275592,0.14204895,0.11594115,0.028905,0.0283548,0.08874105,0.12318975,0.12484965,0.12853035,0.1316763,0.1429005,0.1414926,0.1419798,0.142326,0.14518035,0.1447356,0.03599715,0.0,0.0,0.03655905,0.03650715,0.03718095,0.0377997,0.0766254,0.0383994,0.0384525,0.1584174,0.1652751,0.0,0.0410304,0.1653768,0.12776505,0.0413703,0.0,0.0415809,0.0418632,0.04208535,0.0,0.17601615,0.08849895,0.0870756,0.04376025,0.044385,0.1807398,0.04549035,0.04567545,0.0,0.0460503,0.0457275,0.18363375,0.0,0.0,0.09438765,0.14095365,0.14271165,0.09657135,0.0,0.1446552,0.04862025,0.14516955,0.04956105,0.1001487,0.0,0.2543331,0.05113635,0.05162385,0.051669,0.05224095,0.15660525,0.2638521,0.2142774,0.1078227,0.4272549,0.1106061,0.0,0.11177055]},{"run_name":"icl-claude-sonnet-4.6","task":"exploitable_poker","run_index":1,"reward":314.5,"baseline_reward":316.7,"reference_reward":1138.5,"gain":-2.1999999999999886,"normalized_reward":0.18001791223007266,"normalized_gain":-0.0026770503772207215,"cost_usd":8.90220705,"latency_seconds":8.138083,"instance_count":120,"reward_curve":[34.5,8.0,1.0,-1.0,8.0,3.5,-2.4,-1.0,11.4,12.1,-3.5,34.3,2.4,-2.4,5.4,-11.9,-3.5,-2.3,-6.8,-0.5,-1.0,-0.5,2.0,1.0,-0.5,0.5,94.0,-7.0,2.0,0.5,-1.0,-0.5,0.5,-0.5,-0.5,0.5,-1.0,-1.0,-0.5,0.5,-1.0,-0.5,-6.0,-0.5,-0.5,94.0,-0.5,-1.0,-0.5,1.0,-6.7,14.6,-4.8,-7.7,-7.7,5.1,-0.5,2.3,11.3,-2.3,6.1,-0.5,-5.1,7.7,1.0,-0.5,0.5,0.5,-5.1,3.0,0.5,-0.5,-0.5,-0.5,-0.5,-0.5,1.0,3.5,-0.5,-0.5,0.5,0.5,1.0,0.5,-0.5,3.0,15.2,-1.0,-0.5,1.0,0.5,1.0,1.0,1.0,6.8,-4.0,0.5,-0.5,2.0,1.0,-95.0,-0.5,-0.5,0.5,-1.0,5.0,-4.0,-1.0,0.5,3.5,-1.0,-1.0,96.0,-0.5,4.0,0.5,3.0,3.0,3.0,1.0],"baseline_reward_curve":[-2.5,5.4,-2.0,7.0,-2.4,2.4,4.0,-3.0,44.0,-15.0,5.2,-12.2,4.5,13.5,-7.0,-12.9,5.4,-5.9,11.4,31.0,2.0,-6.0,-0.5,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-1.0,1.0,-2.0,-3.0,10.0,-0.5,-0.5,-1.0,7.5,2.4,-8.0,-9.0,5.7,15.5,4.5,-4.0,-1.0,-2.4,-0.5,0.5,0.5,-2.2,-0.5,1.0,1.0,-0.5,1.0,-3.5,2.4,14.5,0.5,-4.0,-2.4,4.0,-0.5,0.5,-0.5,1.0,1.0,0.5,11.2,1.0,3.0,-5.0,1.0,4.0,1.0,-2.0,0.5,-2.4,-0.5,-2.5,0.5,0.5,1.0,2.0,4.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,2.0,0.5,-11.0,2.0,1.0,2.0,-1.0,5.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[37.0,2.5999999999999996,3.0,-8.0,10.4,1.1,-6.4,2.0,-32.6,27.1,-8.7,46.5,-2.1,-15.9,12.4,1.0,-8.9,3.6000000000000005,-18.2,-31.5,-3.0,5.5,2.5,0.0,-1.5,1.0,93.5,-6.0,1.5,1.5,-3.0,-1.0,1.5,-6.5,-1.0,0.0,1.0,-1.0,-1.5,-93.5,0.0,1.5,-5.0,-1.5,1.5,97.0,-10.5,-0.5,0.0,2.0,-14.2,12.2,3.2,1.2999999999999998,-13.4,-10.4,-5.0,6.3,12.3,0.10000000000000009,6.6,-1.0,-5.6,9.9,1.5,-1.5,-0.5,1.0,-6.1,6.5,-1.9,-15.0,-1.0,3.5,1.9,-4.5,1.5,3.0,0.0,-1.5,-0.5,0.0,-10.2,-0.5,-3.5,8.0,14.2,-5.0,-1.5,3.0,0.0,3.4,1.5,3.5,6.3,-4.5,-0.5,-2.5,-2.0,-2.0,-95.5,0.5,0.5,-0.5,0.0,3.0,-4.5,10.0,-1.5,2.5,-3.0,0.0,91.0,-3.5,5.0,-1.5,-97.0,-0.5,2.5,5.0],"cost_curve":[0.030567,0.0447039,0.0370206,0.0406026,0.04511145,0.0496662,0.0549072,0.05231895,0.05991225,0.06250905,0.06237855,0.0661497,0.0722829,0.07337565,0.0784428,0.08703855,0.08124075,0.0879621,0.0929619,0.02193795,0.021897,0.0213069,0.06903045,0.0468006,0.022215,0.0,0.11366685,0.12716475,0.10631475,0.0,0.02580975,0.02490135,0.0,0.02559825,0.024987,0.0,0.0780831,0.05484465,0.0275133,0.0,0.0555018,0.0255987,0.11746155,0.0280809,0.0280035,0.13978425,0.030201,0.0301953,0.02875815,0.0302586,0.12551085,0.13178835,0.13106565,0.13526595,0.13879455,0.14660205,0.034932,0.14118135,0.1495581,0.1459155,0.150726,0.0373464,0.1559424,0.15940785,0.08051415,0.03891825,0.0,0.0,0.16545345,0.08302425,0.0,0.0405129,0.04005,0.0401247,0.03995115,0.0402861,0.04090125,0.1301208,0.0437094,0.04221105,0.0,0.0,0.0432588,0.0,0.04244325,0.08650455,0.18182745,0.1781574,0.04427955,0.09141285,0.0,0.04568055,0.04558995,0.04626855,0.19049415,0.293211,0.0,0.0486183,0.0981303,0.0496311,0.28877955,0.0487329,0.05044095,0.0,0.04950405,0.15387495,0.20767425,0.2047068,0.0,0.1063179,0.0517152,0.0513738,0.3649614,0.0540189,0.16506285,0.0,0.11093295,0.11165025,0.22545825,0.0566124]},{"run_name":"icl-claude-sonnet-4.6","task":"exploitable_poker","run_index":2,"reward":323.79999999999995,"baseline_reward":316.7,"reference_reward":1138.5,"gain":7.099999999999966,"normalized_reward":0.18927256443427204,"normalized_gain":0.008639571671939603,"cost_usd":10.97648895,"latency_seconds":9.403691,"instance_count":120,"reward_curve":[34.0,-6.0,-5.1,5.1,-1.0,2.3,-6.3,-6.8,-11.1,-0.5,31.5,-1.0,-1.0,11.6,5.8,4.9,3.0,-2.2,31.0,14.0,2.0,-0.5,1.0,-3.0,-0.5,0.5,1.0,-2.0,0.5,0.5,-1.0,1.0,1.0,-1.0,-1.0,0.5,-0.5,-1.0,-2.0,1.0,-0.5,94.0,-0.5,9.4,-0.5,0.5,2.0,-1.0,-0.5,-1.0,9.9,2.3,-5.8,-5.1,-1.0,5.1,15.5,-3.0,10.9,-5.7,0.5,7.0,0.5,-0.5,0.5,1.0,-0.5,-2.3,-0.5,-0.5,1.0,17.3,-0.5,-2.3,-0.5,3.0,-1.0,1.0,-1.0,-0.5,1.0,1.0,0.5,1.0,15.3,0.5,0.5,-2.3,-14.0,-3.0,-0.5,-5.1,-0.5,0.5,3.0,-1.0,3.0,-1.0,3.0,-1.0,0.5,3.0,-17.5,-1.0,-1.0,1.0,-1.0,0.5,-1.0,96.0,-1.0,1.0,-0.5,-1.0,-0.5,0.5,-1.0,0.5,3.0,-3.0],"baseline_reward_curve":[-2.5,5.4,-2.0,7.0,-2.4,2.4,4.0,-3.0,44.0,-15.0,5.2,-12.2,4.5,13.5,-7.0,-12.9,5.4,-5.9,11.4,31.0,2.0,-6.0,-0.5,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-1.0,1.0,-2.0,-3.0,10.0,-0.5,-0.5,-1.0,7.5,2.4,-8.0,-9.0,5.7,15.5,4.5,-4.0,-1.0,-2.4,-0.5,0.5,0.5,-2.2,-0.5,1.0,1.0,-0.5,1.0,-3.5,2.4,14.5,0.5,-4.0,-2.4,4.0,-0.5,0.5,-0.5,1.0,1.0,0.5,11.2,1.0,3.0,-5.0,1.0,4.0,1.0,-2.0,0.5,-2.4,-0.5,-2.5,0.5,0.5,1.0,2.0,4.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,2.0,0.5,-11.0,2.0,1.0,2.0,-1.0,5.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[36.5,-11.4,-3.0999999999999996,-1.9000000000000004,1.4,-0.10000000000000009,-10.3,-3.8,-55.1,14.5,26.3,11.2,-5.5,-1.9000000000000004,12.8,17.8,-2.4000000000000004,3.7,19.6,-17.0,0.0,5.5,1.5,-4.0,-1.5,1.0,0.5,-1.0,0.0,1.5,-3.0,0.5,2.0,-7.0,-1.5,0.0,1.5,-1.0,-3.0,-93.0,0.5,96.0,0.5,8.4,1.5,3.5,-8.0,-0.5,0.0,0.0,2.4000000000000004,-0.10000000000000009,2.2,3.9000000000000004,-6.7,-10.4,11.0,1.0,11.9,-3.3000000000000003,1.0,6.5,0.0,1.7000000000000002,1.0,0.0,-1.5,-1.7999999999999998,-1.5,3.0,-1.4,2.8000000000000007,-1.0,1.7000000000000002,1.9,-1.0,-0.5,0.5,-0.5,-1.5,0.0,0.5,-10.7,0.0,12.3,5.5,-0.5,-6.3,-15.0,-1.0,-1.0,-2.6999999999999997,0.0,3.0,2.5,-1.5,2.0,-3.0,-1.0,-4.0,0.0,4.0,-16.5,-2.0,0.0,-1.0,-1.5,11.5,-3.0,95.0,-3.0,2.0,-5.5,-4.0,0.5,-1.5,-101.0,-3.0,2.5,1.0],"cost_curve":[0.042147,0.05021625,0.04934415,0.0517059,0.05461545,0.05957055,0.0623613,0.0692466,0.0682602,0.0187287,0.07470855,0.075849,0.07927365,0.0825285,0.08877315,0.08813985,0.0964362,0.10508475,0.0989208,0.1051998,0.1070028,0.0264105,0.0272646,0.0557202,0.0284739,0.0,0.0569853,0.08602695,0.0,0.0,0.0301782,0.05885655,0.05986485,0.02988495,0.08912565,0.0,0.0306963,0.06392235,0.0953967,0.06624015,0.0325113,0.15342135,0.03316035,0.1738407,0.0351594,0.0,0.14165685,0.036006,0.0353151,0.0741783,0.15200325,0.150609,0.1549551,0.15706905,0.15559965,0.16199505,0.1665027,0.17148795,0.1750665,0.177249,0.0,0.1779834,0.0,0.04574505,0.0,0.0923871,0.0461925,0.18473055,0.0468411,0.04649205,0.04787595,0.1942137,0.0500577,0.19375485,0.04932405,0.0985458,0.2004102,0.05122095,0.2029647,0.052314,0.0525801,0.1054593,0.0,0.0527541,0.2158677,0.0,0.0,0.2163861,0.2213208,0.22505715,0.05659335,0.22926255,0.05686755,0.0,0.1151559,0.1177203,0.1174182,0.05856975,0.1175718,0.05924505,0.0,0.30007785,0.3712899,0.1260126,0.06247035,0.1256145,0.0632805,0.0,0.1248297,0.42424485,0.1261551,0.19037685,0.0648666,0.12997425,0.06565185,0.0,0.06642705,0.0,0.13264635,0.1327404]},{"run_name":"icl-claude-sonnet-4.6","task":"exploitable_poker","run_index":3,"reward":290.19999999999993,"baseline_reward":316.7,"reference_reward":1138.5,"gain":-26.500000000000057,"normalized_reward":0.15583640163200313,"normalized_gain":-0.03224628863470438,"cost_usd":8.9346669,"latency_seconds":8.188208,"instance_count":120,"reward_curve":[14.0,8.0,12.7,28.8,-5.5,-5.8,8.0,-3.5,-28.8,5.8,-8.5,2.3,-3.5,2.5,-2.3,-1.0,29.3,-2.3,8.0,33.8,-0.5,-0.5,-1.0,-0.5,2.0,-1.0,-1.0,-0.5,-1.0,-2.5,-1.0,0.5,0.5,1.0,0.5,-0.5,-1.0,-1.0,-0.5,9.0,31.0,-0.5,-2.0,-0.5,0.5,-0.5,1.0,-2.0,0.5,-0.5,-8.0,-3.5,17.5,12.8,5.3,5.7,-8.0,-8.0,27.5,-1.0,1.0,2.5,0.5,17.5,2.5,1.0,-2.3,-0.5,-0.5,0.5,1.0,1.0,0.5,-2.3,1.0,-2.3,-0.5,-0.5,0.5,-0.5,13.6,0.5,0.5,-8.0,-2.3,-0.5,1.0,1.0,0.5,-2.3,-0.5,-0.5,8.3,1.0,3.5,-10.0,100.0,-1.0,0.5,0.5,-1.0,3.5,-2.5,0.5,1.0,2.0,2.5,-1.0,3.0,-0.5,0.5,-1.0,-1.0,-0.5,1.0,1.0,-1.0,-2.5,-1.0,1.0],"baseline_reward_curve":[-2.5,5.4,-2.0,7.0,-2.4,2.4,4.0,-3.0,44.0,-15.0,5.2,-12.2,4.5,13.5,-7.0,-12.9,5.4,-5.9,11.4,31.0,2.0,-6.0,-0.5,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-1.0,1.0,-2.0,-3.0,10.0,-0.5,-0.5,-1.0,7.5,2.4,-8.0,-9.0,5.7,15.5,4.5,-4.0,-1.0,-2.4,-0.5,0.5,0.5,-2.2,-0.5,1.0,1.0,-0.5,1.0,-3.5,2.4,14.5,0.5,-4.0,-2.4,4.0,-0.5,0.5,-0.5,1.0,1.0,0.5,11.2,1.0,3.0,-5.0,1.0,4.0,1.0,-2.0,0.5,-2.4,-0.5,-2.5,0.5,0.5,1.0,2.0,4.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,2.0,0.5,-11.0,2.0,1.0,2.0,-1.0,5.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[16.5,2.5999999999999996,14.7,21.8,-3.1,-8.2,4.0,-0.5,-72.8,20.8,-13.7,14.5,-8.0,-11.0,4.7,11.9,23.9,3.6000000000000005,-3.4000000000000004,2.799999999999997,-2.5,5.5,-0.5,-1.5,1.0,-0.5,-1.5,0.5,-1.5,-1.5,-3.0,0.0,1.5,-5.0,0.0,-1.0,1.0,-1.0,-1.5,-85.0,32.0,1.5,-1.0,-1.5,2.5,2.5,-9.0,-1.5,1.0,0.5,-15.5,-5.9,25.5,21.8,-0.40000000000000036,-9.8,-12.5,-4.0,28.5,1.4,1.5,2.0,0.0,19.7,3.0,0.0,-3.3,0.0,-1.5,4.0,-1.4,-13.5,0.0,1.7000000000000002,3.4,-6.3,0.0,-1.0,1.0,-1.5,12.6,0.0,-10.7,-9.0,-5.3,4.5,0.0,-3.0,-0.5,-0.2999999999999998,-1.0,1.9,8.8,3.5,3.0,-10.5,99.0,-3.0,-3.5,-2.5,-1.5,4.5,-1.5,-0.5,2.0,0.0,2.0,10.0,1.0,-1.5,-1.5,0.0,-6.0,-3.5,2.0,-1.0,-101.0,-6.0,-1.5,5.0],"cost_curve":[0.032028,0.04029735,0.03949725,0.04670805,0.0503346,0.05111385,0.0533886,0.05404605,0.06013065,0.06077955,0.06094185,0.0684243,0.07000125,0.06916515,0.0750795,0.0752028,0.07423815,0.0794196,0.0841479,0.08634285,0.0208611,0.02082,0.0619347,0.0222318,0.0930732,0.0232587,0.02279865,0.0235167,0.0499896,0.0493272,0.05047545,0.0,0.0,0.0507369,0.0,0.0248763,0.05209065,0.026013,0.026154,0.1394982,0.1411116,0.0276087,0.0850209,0.02799405,0.0,0.0294264,0.05935755,0.0869433,0.0,0.03090975,0.12539535,0.1267413,0.13852155,0.13619805,0.13639665,0.1393449,0.14225145,0.1447587,0.15190125,0.15218025,0.0750159,0.073962,0.0,0.1577079,0.078195,0.07827885,0.1607607,0.0410856,0.0401097,0.0,0.04071285,0.04030455,0.0,0.1668405,0.0419964,0.17035005,0.0431904,0.04239885,0.0,0.04308285,0.17529675,0.0,0.0,0.17930235,0.1785333,0.0454494,0.09359775,0.0462591,0.0,0.1890288,0.0472119,0.04674045,0.1942164,0.0974976,0.146994,0.30090375,0.2024181,0.0507792,0.0,0.0,0.0509592,0.1026795,0.104232,0.0,0.10480035,0.15671475,0.10506375,0.1067208,0.2695995,0.0545712,0.0,0.05492835,0.1085802,0.05458905,0.05614605,0.1647822,0.05531415,0.1124634,0.0565752,0.05672115]},{"run_name":"icl-claude-sonnet-4.6","task":"exploitable_poker","run_index":4,"reward":392.1,"baseline_reward":316.7,"reference_reward":1138.5,"gain":75.40000000000003,"normalized_reward":0.257239526321027,"normalized_gain":0.09174981747383797,"cost_usd":9.33971595,"latency_seconds":8.347449,"instance_count":120,"reward_curve":[7.0,-0.5,7.0,38.0,-3.5,6.0,3.5,16.0,2.5,-7.0,-35.0,-3.0,37.5,67.5,-3.0,-1.0,-3.5,-0.5,7.0,-1.0,1.0,0.5,10.0,1.0,-0.5,2.0,-1.0,-1.0,-3.0,99.0,-3.0,-0.5,0.5,-0.5,-1.0,-3.0,1.0,2.0,-3.0,-0.5,0.5,-0.5,-0.5,0.5,0.5,2.0,-1.0,-1.0,-1.0,-0.5,-3.5,-8.0,-3.5,-2.5,15.0,2.5,-1.0,34.5,15.0,5.5,1.0,16.0,1.0,0.5,-0.5,-2.5,-2.5,-1.0,1.0,0.5,-15.5,-3.0,1.0,-5.7,16.3,-0.5,-2.5,0.5,0.5,-0.5,0.5,-0.5,1.0,1.0,-3.0,0.5,3.0,-0.5,7.0,1.0,0.5,-3.0,1.0,-0.5,-0.5,3.0,-1.0,1.0,-1.0,-1.0,0.5,4.0,-1.0,-1.0,3.0,-1.0,100.0,0.5,0.5,-1.0,2.0,3.0,3.0,3.0,0.5,-1.0,1.0,-22.0,-1.0,-0.5],"baseline_reward_curve":[-2.5,5.4,-2.0,7.0,-2.4,2.4,4.0,-3.0,44.0,-15.0,5.2,-12.2,4.5,13.5,-7.0,-12.9,5.4,-5.9,11.4,31.0,2.0,-6.0,-0.5,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-1.0,1.0,-2.0,-3.0,10.0,-0.5,-0.5,-1.0,7.5,2.4,-8.0,-9.0,5.7,15.5,4.5,-4.0,-1.0,-2.4,-0.5,0.5,0.5,-2.2,-0.5,1.0,1.0,-0.5,1.0,-3.5,2.4,14.5,0.5,-4.0,-2.4,4.0,-0.5,0.5,-0.5,1.0,1.0,0.5,11.2,1.0,3.0,-5.0,1.0,4.0,1.0,-2.0,0.5,-2.4,-0.5,-2.5,0.5,0.5,1.0,2.0,4.0,3.0,0.5,-1.0,-1.0,1.0,-1.0,2.0,0.5,-11.0,2.0,1.0,2.0,-1.0,5.0,3.0,-1.0,2.0,100.0,3.5,0.5,-4.0],"gain_curve":[9.5,-5.9,9.0,31.0,-1.1,3.6,-0.5,19.0,-41.5,8.0,-40.2,9.2,33.0,54.0,4.0,11.9,-8.9,5.4,-4.4,-32.0,-1.0,6.5,10.5,0.0,-1.5,2.5,-1.5,0.0,-3.5,100.0,-5.0,-1.0,1.5,-6.5,-1.5,-3.5,3.0,2.0,-4.0,-94.5,1.5,1.5,0.5,-0.5,2.5,5.0,-11.0,-0.5,-0.5,0.5,-11.0,-10.4,4.5,6.5,9.3,-13.0,-5.5,38.5,16.0,7.9,1.5,15.5,0.5,2.7,0.0,-3.5,-3.5,-0.5,0.0,4.0,-17.9,-17.5,0.5,-1.7000000000000002,18.7,-4.5,-2.0,0.0,1.0,-1.5,-0.5,-1.0,-10.2,0.0,-6.0,5.5,2.0,-4.5,6.0,3.0,0.0,-0.6000000000000001,1.5,2.0,-1.0,2.5,-2.0,-1.0,-5.0,-4.0,0.0,5.0,0.0,-2.0,4.0,-3.0,99.5,11.5,-1.5,-2.0,0.0,4.0,-2.0,0.0,1.5,-3.0,-99.0,-25.5,-1.5,3.5],"cost_curve":[0.031995,0.013632,0.03847185,0.03889485,0.041763,0.04346385,0.04643925,0.05240115,0.0543348,0.0612381,0.061206,0.06186795,0.0716904,0.06865335,0.07197705,0.07525695,0.074682,0.02098785,0.0807015,0.08338005,0.04102035,0.0,0.11329515,0.04593915,0.0228765,0.070461,0.0234858,0.0239742,0.12067335,0.1222095,0.052134,0.024033,0.0,0.0241389,0.0503067,0.0536097,0.05461335,0.08304945,0.08101935,0.02686845,0.0,0.02621475,0.02753805,0.0,0.0,0.08509785,0.05770875,0.05690025,0.0598833,0.0290604,0.12038115,0.12583725,0.1280922,0.13156425,0.13253775,0.13583685,0.1357635,0.1441752,0.1555434,0.1510101,0.07498275,0.15360885,0.0391449,0.0,0.0380763,0.15420525,0.1614837,0.16174515,0.04095135,0.0,0.1661826,0.1691217,0.1736256,0.19062525,0.1793166,0.04504965,0.1803561,0.0,0.0,0.04660365,0.0,0.0450891,0.0450048,0.04584465,0.1849713,0.0,0.0940707,0.0468036,0.19765125,0.0491967,0.0,0.1965363,0.0997698,0.04909695,0.048789,0.10055445,0.10035345,0.10179615,0.10260225,0.1019076,0.0,0.15819045,0.1066755,0.05159655,0.26973735,0.05283435,0.210759,0.0,0.0,0.108507,0.10989765,0.11004855,0.1112682,0.1129584,0.0,0.0561306,0.05717475,0.2915373,0.0589314,0.0584616]},{"run_name":"icl-claude-sonnet-4.6","task":"sales_prediction","run_index":0,"reward":10.023900000000001,"baseline_reward":5.1783,"reference_reward":12.0,"gain":4.845600000000001,"normalized_reward":0.6913645805675732,"normalized_gain":0.7103214741193545,"cost_usd":3.47168925,"latency_seconds":17.212394,"instance_count":12,"reward_curve":[0.6795,0.7207,0.7599,0.8323,0.8471,0.8728,0.867,0.9047,0.8503,0.8843,0.9,0.9053],"baseline_reward_curve":[0.6807,0.3798,0.488,0.6244,0.2397,0.4144,0.7293,0.2857,0.3494,0.3821,0.2823,0.3225],"gain_curve":[-0.0011999999999999789,0.3409,0.27190000000000003,0.20790000000000008,0.6073999999999999,0.45840000000000003,0.13770000000000004,0.619,0.5008999999999999,0.5022,0.6177,0.5828],"cost_curve":[0.2674287,0.2782932,0.3000525,0.35000715,0.31129395,0.31491495,0.2487168,0.26466795,0.27210375,0.2826567,0.2856654,0.2958882]},{"run_name":"icl-claude-sonnet-4.6","task":"sales_prediction","run_index":1,"reward":9.465,"baseline_reward":5.1783,"reference_reward":12.0,"gain":4.2867,"normalized_reward":0.6040732815843316,"normalized_gain":0.6283917498570737,"cost_usd":2.81698875,"latency_seconds":15.788846,"instance_count":12,"reward_curve":[0.7069,0.7733,0.8016,0.7934,0.7661,0.8085,0.8061,0.8216,0.7783,0.7714,0.8173,0.8205],"baseline_reward_curve":[0.6807,0.3798,0.488,0.6244,0.2397,0.4144,0.7293,0.2857,0.3494,0.3821,0.2823,0.3225],"gain_curve":[0.0262,0.39349999999999996,0.3136,0.16900000000000004,0.5264,0.3941,0.07680000000000009,0.5359,0.4289,0.3893,0.535,0.498],"cost_curve":[0.2500569,0.23418345,0.21703005,0.2418261,0.19167975,0.20597865,0.21713775,0.2292111,0.2426541,0.2572212,0.26149755,0.26851215]},{"run_name":"icl-claude-sonnet-4.6","task":"sales_prediction","run_index":2,"reward":9.779799999999998,"baseline_reward":5.1783,"reference_reward":12.0,"gain":4.601499999999998,"normalized_reward":0.6532400393583954,"normalized_gain":0.6745386047466171,"cost_usd":2.8448082,"latency_seconds":12.50848,"instance_count":12,"reward_curve":[0.6412,0.7038,0.8438,0.7965,0.8238,0.8346,0.8369,0.8725,0.855,0.8515,0.8662,0.854],"baseline_reward_curve":[0.6807,0.3798,0.488,0.6244,0.2397,0.4144,0.7293,0.2857,0.3494,0.3821,0.2823,0.3225],"gain_curve":[-0.03949999999999998,0.32399999999999995,0.3558,0.17210000000000003,0.5841,0.4202,0.10760000000000003,0.5868,0.5056,0.46940000000000004,0.5839,0.5315],"cost_curve":[0.22735365,0.27168345,0.20663565,0.2073129,0.2442225,0.22655595,0.23297655,0.2477415,0.26002485,0.2303592,0.2398614,0.2500806]},{"run_name":"icl-claude-sonnet-4.6","task":"sales_prediction","run_index":3,"reward":10.052100000000001,"baseline_reward":5.1783,"reference_reward":12.0,"gain":4.873800000000001,"normalized_reward":0.6957689724647417,"normalized_gain":0.7144553410440214,"cost_usd":2.8396476,"latency_seconds":18.697372,"instance_count":12,"reward_curve":[0.0286,0.6464,0.8896,0.9199,0.85,0.9564,0.9694,0.9465,0.9422,0.9634,0.9741,0.9656],"baseline_reward_curve":[0.6807,0.3798,0.488,0.6244,0.2397,0.4144,0.7293,0.2857,0.3494,0.3821,0.2823,0.3225],"gain_curve":[-0.6521,0.26659999999999995,0.40159999999999996,0.2955000000000001,0.6103,0.542,0.2401000000000001,0.6608,0.5928,0.5813,0.6918,0.6431],"cost_curve":[0.23584575,0.2537925,0.23201205,0.2006637,0.1909605,0.21075225,0.21915285,0.2325996,0.24439515,0.2588412,0.2731344,0.28749765]},{"run_name":"icl-claude-sonnet-4.6","task":"sales_prediction","run_index":4,"reward":10.769200000000001,"baseline_reward":5.1783,"reference_reward":12.0,"gain":5.590900000000001,"normalized_reward":0.8077685976228781,"normalized_gain":0.8195757655716319,"cost_usd":2.8243458,"latency_seconds":14.139589,"instance_count":12,"reward_curve":[0.717,0.7808,0.9324,0.927,0.8532,0.92,0.9319,0.9219,0.9225,0.9394,0.9654,0.9577],"baseline_reward_curve":[0.6807,0.3798,0.488,0.6244,0.2397,0.4144,0.7293,0.2857,0.3494,0.3821,0.2823,0.3225],"gain_curve":[0.0363,0.401,0.4444,0.3026000000000001,0.6134999999999999,0.5056,0.2026,0.6362000000000001,0.5730999999999999,0.5573,0.6831,0.6352],"cost_curve":[0.2945061,0.2867004,0.2543715,0.19615185,0.1863744,0.19187745,0.20120655,0.21402465,0.22970025,0.24229905,0.256506,0.2706276]},{"run_name":"icl-gemini-3-flash","task":"blind_spectrum_monitoring","run_index":0,"reward":35.37060000000002,"baseline_reward":19.7597,"reference_reward":90.0,"gain":15.610900000000019,"normalized_reward":0.22223551018664864,"normalized_gain":0.22224990496908495,"cost_usd":0.7334373,"latency_seconds":4.418505,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3669,0.3599,0.3715,0.3257,0.2831,0.3398,0.3406,0.3804,0.3609,0.3416,0.4461,0.3541,0.4166,0.4199,0.3771,0.3566,0.4344,0.4391,0.4395,0.4352,0.4395,0.4943,0.4859,0.4799,0.4342,0.3514,0.332,0.2905,0.2936,0.2763,0.3299,0.2817,0.3303,0.3812,0.3755,0.3808,0.4474,0.4364,0.4195,0.4127,0.4199,0.3595,0.3392,0.3497,0.3782,0.4172,0.4022,0.403,0.3184,0.3251,0.3427,0.3414,0.3429,0.3566,0.3517,0.3393,0.3542,0.3315,0.3514,0.4572,0.4186,0.4284,0.4138,0.418,0.5144,0.4796,0.483,0.5147,0.5524,0.504,0.4806,0.4513,0.4283,0.5328,0.5016,0.5622,0.4766,0.4883,0.4982,0.3648,0.3709,0.4046,0.4018,0.314,0.3163,0.329,0.3235],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.1405,0.1358,0.1587,0.09839999999999999,0.08810000000000001,0.1177,0.128,0.14,0.1324,0.12230000000000002,0.1978,0.16210000000000002,0.21920000000000003,0.196,0.15009999999999998,0.15009999999999998,0.187,0.23729999999999998,0.2376,0.22219999999999998,0.2312,0.26990000000000003,0.2526,0.27049999999999996,0.22369999999999998,0.1202,0.12480000000000002,0.0923,0.08510000000000004,0.0668,0.12720000000000004,0.0582,0.11639999999999998,0.1783,0.1341,0.18350000000000002,0.22710000000000002,0.21000000000000002,0.2269,0.17300000000000001,0.1983,0.13219999999999998,0.11180000000000001,0.1282,0.1473,0.1839,0.17350000000000002,0.18530000000000002,0.09690000000000001,0.11760000000000001,0.13,0.11679999999999999,0.11769999999999997,0.15679999999999997,0.11560000000000001,0.14379999999999998,0.1386,0.08960000000000001,0.13999999999999999,0.2406,0.19760000000000003,0.2303,0.1983,0.19079999999999997,0.2592,0.27080000000000004,0.2618,0.26060000000000005,0.3389,0.25680000000000003,0.2503,0.23049999999999998,0.19060000000000002,0.2906000000000001,0.28870000000000007,0.3134,0.27690000000000003,0.2804,0.28059999999999996,0.14820000000000003,0.1608,0.18530000000000002,0.2014,0.1144,0.11460000000000004,0.08480000000000001,0.1013],"cost_curve":[0.0009075,0.00181,0.002757,0.0037075,0.0038355,0.00293655,0.0030427,0.00539,0.0026412,0.00336135,0.00466995,0.00469905,0.0053036,0.0058247,0.00700525,0.00389125,0.00458885,0.005612,0.00627905,0.00665765,0.0076132,0.0043299,0.00496395,0.00585455,0.00712615,0.00782575,0.0097228,0.00550525,0.00673335,0.0071675,0.00870665,0.0094082,0.0056358,0.00608485,0.0070724,0.0074539,0.00823295,0.0052695,0.0280495,0.00700405,0.00808115,0.030491,0.00921925,0.032024,0.0066105,0.007655,0.00863,0.00909305,0.0070252,0.0063607,0.0070132,0.0076672,0.0085557,0.0090967,0.0060733,0.0066919,0.0083429,0.00821195,0.00928045,0.00932495,0.00639715,0.0079341,0.0082631,0.00868115,0.00999915,0.0101797,0.00754085,0.0075859,0.0083369,0.0090089,0.0096874,0.010649,0.0083957,0.00895625,0.0100708,0.01048785,0.0104768,0.0112998,0.008742,0.0092485,0.01026605,0.0106551,0.01121665,0.00803275,0.0088388,0.0099028,0.0101529,0.01112485,0.01183085,0.0093466]},{"run_name":"icl-gemini-3-flash","task":"blind_spectrum_monitoring","run_index":1,"reward":32.26659999999994,"baseline_reward":19.7597,"reference_reward":90.0,"gain":12.506899999999941,"normalized_reward":0.1780435370663013,"normalized_gain":0.17805874974907482,"cost_usd":0.6725014,"latency_seconds":4.175026,"instance_count":90,"reward_curve":[0.2072,0.2245,0.2478,0.2669,0.2669,0.2669,0.2669,0.3196,0.3196,0.3318,0.4085,0.3943,0.3943,0.3943,0.3943,0.3943,0.3943,0.3943,0.3638,0.3638,0.3638,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665,0.3665],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.0237,0.03609999999999999,0.040500000000000036,0.04280000000000003,0.05410000000000004,0.039600000000000024,0.12459999999999999,0.0975,0.11919999999999997,0.16809999999999997,0.16579999999999998,0.175,0.146,0.20229999999999998,0.1969,0.1704,0.16729999999999998,0.15730000000000002,0.1164,0.162,0.1646,0.1535,0.15819999999999998,0.1421,0.13319999999999999,0.1571,0.156,0.1353,0.1593,0.1683,0.158,0.157,0.1638,0.143,0.15259999999999999,0.1636,0.1251,0.1692,0.1462,0.1401,0.1739,0.1268,0.1449,0.1392,0.1391,0.145,0.1356,0.13319999999999999,0.1378,0.1488,0.145,0.159,0.1538,0.1419,0.14129999999999998,0.1667,0.1304,0.17099999999999999,0.15089999999999998,0.12459999999999999,0.1551,0.1499,0.1455,0.1684,0.151,0.13929999999999998,0.11130000000000001,0.15769999999999998,0.14529999999999998,0.1124,0.153,0.11929999999999999,0.1362,0.1457,0.1288,0.1243,0.1536,0.1177,0.1668,0.1586,0.1489,0.1499,0.15639999999999998,0.1472,0.1661,0.1669,0.1648,0.12229999999999999,0.14429999999999998],"cost_curve":[0.0014545,0.002407,0.0031035,0.00260165,0.00270155,0.00276715,0.00495,0.0063375,0.0028023,0.00352095,0.00413705,0.00517465,0.0057752,0.00608925,0.00679635,0.00384485,0.00451145,0.006087,0.0062881,0.0066692,0.00735425,0.00457905,0.00527965,0.0062697,0.0066808,0.00735235,0.0044252,0.00505975,0.0057648,0.0064704,0.00720645,0.007907,0.00526945,0.0059355,0.00627905,0.0069816,0.00800565,0.0087092,0.00552975,0.0063694,0.00723045,0.00832505,0.0087001,0.0061607,0.00657375,0.0072578,0.00797385,0.0086889,0.00978245,0.0065606,0.00761865,0.0083357,0.0090177,0.00942925,0.00654995,0.00726395,0.0080175,0.0091141,0.00955815,0.0102772,0.00736385,0.0081184,0.0089425,0.01000255,0.0104136,0.0074993,0.0086689,0.00911445,0.009902,0.0107536,0.00783835,0.00855335,0.0093079,0.0101615,0.01125505,0.0116641,0.00875235,0.0099194,0.01032995,0.01104945,0.0117975,0.00888525,0.0096383,0.01039535,0.0111494,0.01206295,0.00916925,0.00986925,0.0106363,0.0114033]},{"run_name":"icl-gemini-3-flash","task":"blind_spectrum_monitoring","run_index":2,"reward":31.50939999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":11.74969999999999,"normalized_reward":0.16726320135537215,"normalized_gain":0.16727861355945217,"cost_usd":0.67885765,"latency_seconds":3.640166,"instance_count":90,"reward_curve":[0.2482,0.2752,0.311,0.2622,0.3032,0.288,0.3495,0.3256,0.3446,0.3848,0.4097,0.4124,0.431,0.4145,0.4056,0.4338,0.403,0.401,0.3508,0.3511,0.3509,0.3095,0.3117,0.293,0.2876,0.3177,0.298,0.2961,0.3015,0.3234,0.3385,0.3492,0.3435,0.3317,0.28,0.3045,0.3738,0.3607,0.3477,0.3677,0.3462,0.3243,0.3479,0.3268,0.3336,0.3568,0.3006,0.3249,0.3125,0.3082,0.36,0.3856,0.3504,0.3449,0.3026,0.2913,0.3051,0.3294,0.3644,0.3826,0.3626,0.3662,0.3751,0.4216,0.4004,0.3643,0.3891,0.3815,0.4226,0.4652,0.3153,0.2931,0.3794,0.3654,0.3548,0.3802,0.366,0.3737,0.3244,0.3299,0.2967,0.4806,0.5028,0.3612,0.3446,0.4346,0.4247,0.3595,0.2867,0.2972],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.026999999999999996,0.0993,0.0358,0.07910000000000003,0.07519999999999999,0.12219999999999998,0.1306,0.12250000000000003,0.17219999999999996,0.1693,0.18389999999999998,0.2117,0.1662,0.2136,0.23640000000000003,0.17910000000000004,0.17400000000000002,0.1443,0.10370000000000001,0.14909999999999998,0.1076,0.09869999999999998,0.08469999999999997,0.06320000000000003,0.08439999999999998,0.08859999999999998,0.08559999999999998,0.0703,0.11620000000000003,0.14030000000000004,0.14070000000000002,0.13400000000000004,0.129,0.05650000000000002,0.09059999999999999,0.17090000000000002,0.11930000000000002,0.1504,0.14740000000000003,0.11980000000000002,0.13169999999999998,0.10819999999999999,0.10519999999999999,0.1063,0.12940000000000002,0.07909999999999998,0.09400000000000003,0.07919999999999999,0.07949999999999999,0.14229999999999998,0.1641,0.1429,0.13219999999999998,0.07799999999999999,0.06609999999999999,0.10529999999999998,0.09330000000000002,0.1689,0.16699999999999998,0.12069999999999997,0.15480000000000002,0.1585,0.20059999999999997,0.20229999999999998,0.14880000000000002,0.1619,0.12630000000000002,0.21379999999999996,0.244,0.06120000000000003,0.07960000000000003,0.1322,0.1351,0.134,0.1425,0.1238,0.16079999999999997,0.07560000000000003,0.13020000000000004,0.08880000000000002,0.263,0.2862,0.1511,0.12530000000000002,0.2342,0.22510000000000002,0.1578,0.04250000000000001,0.07500000000000001],"cost_curve":[0.0011045,0.0017095,0.002531,0.0024676,0.00368825,0.00288965,0.00291565,0.0063895,0.003537,0.00395065,0.00479925,0.0056453,0.0060814,0.00687745,0.00450255,0.0043487,0.00585175,0.00643775,0.0072433,0.00707035,0.00396665,0.0050457,0.00535875,0.00601275,0.0069378,0.0076084,0.00504825,0.00491575,0.0055828,0.0062919,0.00705345,0.00770445,0.0049554,0.00633305,0.00670165,0.0075427,0.0082138,0.0082874,0.0055054,0.00652695,0.00681145,0.00746695,0.008348,0.00546305,0.0060696,0.00641615,0.0071241,0.0072696,0.00788905,0.009179,0.00549655,0.0060875,0.00719305,0.0077046,0.0080976,0.0087211,0.0058126,0.0063261,0.0069841,0.0076706,0.0083141,0.0090521,0.00997515,0.00713275,0.00783375,0.00826125,0.0089358,0.0096113,0.01065535,0.0074144,0.007922,0.008374,0.0091735,0.010136,0.01097405,0.0080966,0.00889565,0.0096012,0.01008775,0.0106048,0.00738585,0.00801135,0.00871435,0.0091408,0.0098623,0.0107293,0.059109,0.0081829,0.00898045,0.009925]},{"run_name":"icl-gemini-3-flash","task":"blind_spectrum_monitoring","run_index":3,"reward":34.906699999999994,"baseline_reward":19.7597,"reference_reward":90.0,"gain":15.146999999999995,"normalized_reward":0.2156309172966584,"normalized_gain":0.21564543431619732,"cost_usd":0.599187,"latency_seconds":3.498999,"instance_count":90,"reward_curve":[0.192,0.2393,0.2262,0.2385,0.2209,0.2555,0.3029,0.3202,0.3121,0.3302,0.3001,0.3104,0.3075,0.3135,0.308,0.3285,0.3333,0.3245,0.3291,0.3524,0.3301,0.3472,0.3296,0.3946,0.3361,0.3301,0.3998,0.4356,0.5103,0.5744,0.582,0.4562,0.551,0.4695,0.4674,0.4456,0.436,0.4252,0.4213,0.4492,0.4465,0.4456,0.408,0.4095,0.3863,0.4347,0.4185,0.3829,0.3695,0.3327,0.3347,0.3561,0.4034,0.4091,0.4058,0.387,0.3733,0.4362,0.5068,0.4142,0.3986,0.3919,0.4855,0.4075,0.4097,0.4,0.3842,0.2992,0.3681,0.4365,0.4818,0.463,0.398,0.3495,0.3634,0.4462,0.3541,0.3728,0.3336,0.4138,0.3916,0.4184,0.4095,0.4716,0.5396,0.4312,0.4087,0.4133,0.4283,0.44],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.008899999999999991,0.014500000000000013,0.0121,-0.0031999999999999806,0.042700000000000016,0.0756,0.12519999999999998,0.09,0.11759999999999998,0.059699999999999975,0.0819,0.0882,0.06520000000000001,0.11599999999999999,0.13110000000000002,0.1094,0.0975,0.12260000000000001,0.10499999999999998,0.1283,0.1453,0.11660000000000001,0.1863,0.11170000000000002,0.0968,0.19039999999999999,0.2251,0.2791,0.3672,0.3838,0.2477,0.3415,0.2668,0.24389999999999998,0.2317,0.2331,0.18380000000000002,0.224,0.2289,0.22010000000000002,0.253,0.16829999999999998,0.18789999999999998,0.15899999999999997,0.20729999999999998,0.19699999999999998,0.15200000000000002,0.1362,0.10400000000000001,0.11699999999999999,0.13460000000000003,0.1959,0.19640000000000002,0.1812,0.1618,0.17350000000000002,0.20009999999999997,0.3113,0.1986,0.1567,0.18050000000000002,0.26890000000000003,0.18649999999999997,0.2116,0.18450000000000003,0.15699999999999997,0.04400000000000004,0.15929999999999997,0.2153,0.2277,0.24950000000000003,0.15080000000000002,0.11919999999999997,0.1426,0.2085,0.11190000000000003,0.15990000000000001,0.08480000000000001,0.2141,0.1837,0.2008,0.1929,0.2615,0.3203,0.23080000000000003,0.2091,0.2116,0.1841,0.2178],"cost_curve":[0.001278,0.0021925,0.0028715,0.00258955,0.0030742,0.00219905,0.00295215,0.0054215,0.00234515,0.00292425,0.0038473,0.0038274,0.00480895,0.00560505,0.0062751,0.0027244,0.00336845,0.00409005,0.00500715,0.0056737,0.00631425,0.00706635,0.00375885,0.0047964,0.005136,0.0057106,0.0067157,0.0074363,0.0044675,0.00523555,0.00565365,0.0063868,0.00745635,0.00798295,0.0051868,0.00552535,0.0057299,0.00645295,0.00700295,0.00779145,0.0046178,0.00520085,0.00620695,0.006791,0.00757155,0.0081541,0.00511795,0.005974,0.0065975,0.006953,0.0076715,0.00815605,0.00876605,0.0059655,0.0065745,0.0069895,0.00781,0.00909855,0.009153,0.0057864,0.00698295,0.00714295,0.007808,0.00827805,0.00903,0.009994,0.00675645,0.00720805,0.00804105,0.0089241,0.0096152,0.0106862,0.00748725,0.0079193,0.00881235,0.00923935,0.00997945,0.00702435,0.0074519,0.0083114,0.00898995,0.00985695,0.0105325,0.01123955,0.00800155,0.00864865,0.00921265,0.00985765,0.01046765,0.01164975]},{"run_name":"icl-gemini-3-flash","task":"blind_spectrum_monitoring","run_index":4,"reward":31.14099999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":11.381299999999992,"normalized_reward":0.16201825196827954,"normalized_gain":0.16203376124532487,"cost_usd":0.7208449,"latency_seconds":4.171588,"instance_count":90,"reward_curve":[0.2273,0.2437,0.3124,0.3117,0.2678,0.333,0.3169,0.3317,0.2951,0.2944,0.2944,0.2944,0.2749,0.2749,0.2749,0.2795,0.2589,0.2589,0.2654,0.2661,0.2907,0.2661,0.2543,0.2543,0.2543,0.2543,0.2543,0.2543,0.2543,0.2584,0.2811,0.2719,0.2591,0.2692,0.2893,0.2893,0.2795,0.2526,0.2865,0.288,0.2952,0.312,0.2921,0.2892,0.296,0.3379,0.3333,0.3944,0.4045,0.3267,0.4135,0.363,0.3586,0.3237,0.3145,0.2811,0.2784,0.3417,0.3237,0.3208,0.3341,0.3654,0.3839,0.3977,0.4085,0.3942,0.3079,0.3465,0.4629,0.5654,0.5344,0.5,0.4657,0.4652,0.4961,0.4711,0.524,0.3979,0.3642,0.3558,0.5219,0.5176,0.5594,0.5535,0.5577,0.5,0.4717,0.3709,0.4484,0.4606],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,-0.004500000000000004,0.10070000000000001,0.08529999999999999,0.04369999999999999,0.12020000000000003,0.08960000000000001,0.1367,0.07299999999999998,0.08179999999999998,0.05399999999999999,0.06589999999999999,0.05559999999999998,0.026599999999999985,0.08289999999999997,0.08210000000000003,0.03500000000000003,0.03190000000000001,0.058900000000000036,0.018699999999999994,0.0889,0.06420000000000001,0.04130000000000003,0.04600000000000001,0.029900000000000038,0.02100000000000002,0.04490000000000002,0.04380000000000003,0.023100000000000037,0.05120000000000002,0.08290000000000003,0.06339999999999998,0.049600000000000005,0.0665,0.0658,0.0754,0.07660000000000003,0.011199999999999988,0.08919999999999997,0.06769999999999998,0.06880000000000003,0.1194,0.05240000000000003,0.06760000000000002,0.06869999999999998,0.11049999999999999,0.11179999999999998,0.16349999999999998,0.17120000000000002,0.098,0.19579999999999997,0.1415,0.15109999999999998,0.11099999999999999,0.08990000000000001,0.055900000000000005,0.07859999999999998,0.1056,0.12819999999999998,0.10519999999999996,0.0922,0.154,0.16730000000000003,0.1767,0.21039999999999998,0.1787,0.0807,0.09129999999999999,0.2541,0.3442,0.2803,0.2865,0.2185,0.2349,0.2753,0.23340000000000002,0.28180000000000005,0.18499999999999997,0.11540000000000003,0.15610000000000002,0.31400000000000006,0.29999999999999993,0.3428,0.3434,0.3384,0.2996,0.2721,0.16920000000000002,0.20420000000000002,0.2384],"cost_curve":[0.0012805,0.0019005,0.002892,0.0032322,0.0026871,0.0031733,0.0058425,0.007457,0.0035871,0.0049043,0.0056009,0.00633295,0.00717855,0.0038212,0.00457085,0.0053905,0.00648305,0.00734415,0.0077112,0.0048452,0.00597525,0.00633985,0.0074304,0.0077309,0.004832,0.00595805,0.0063306,0.00745715,0.00844275,0.0092753,0.006066,0.0073261,0.0082012,0.00865575,0.00947985,0.00717565,0.0080037,0.00878685,0.0091494,0.0063327,0.00720325,0.0083534,0.009381,0.00966805,0.0068854,0.00803195,0.008528,0.037546,0.0087425,0.00589975,0.0062562,0.00711365,0.0079686,0.0083526,0.00894805,0.009341,0.0068978,0.00695975,0.0075337,0.0083686,0.0088921,0.00975205,0.00657725,0.00736775,0.00804175,0.00871775,0.00946225,0.01077125,0.00710095,0.00768195,0.0085074,0.0091659,0.0098559,0.0107344,0.01179145,0.0084442,0.0091492,0.01042975,0.0110103,0.01232335,0.00900315,0.00927865,0.00970615,0.01036765,0.0110251,0.0119191,0.0094133,0.00940085,0.0105359,0.0112544]},{"run_name":"icl-gemini-3-flash","task":"codebase_adaptation","run_index":0,"reward":6.175000000000001,"baseline_reward":7.749999999999999,"reference_reward":19.0,"gain":-1.5749999999999984,"normalized_reward":-0.3429319371727747,"normalized_gain":-0.13999999999999985,"cost_usd":2.9198341,"latency_seconds":6.217053,"instance_count":19,"reward_curve":[0.75,0.0,0.45,0.0,0.85,0.0,0.725,0.0,0.0,0.5,0.0,0.0,0.85,0.825,0.0,0.0,0.7,0.0,0.525],"baseline_reward_curve":[0.65,0.225,0.45,0.0,0.6,0.7,0.725,0.0,0.7,0.0,0.475,0.35,0.675,0.8,0.0,0.0,0.625,0.45,0.325],"gain_curve":[0.09999999999999998,-0.225,0.0,0.0,0.25,-0.7,0.0,0.0,-0.7,0.5,-0.475,-0.35,0.17499999999999993,0.02499999999999991,0.0,0.0,0.07499999999999996,-0.45,0.2],"cost_curve":[0.01481135,0.0282076,0.07530535,0.0289199,0.02841225,0.05106385,0.06435165,0.1541832,0.0652343,0.14888635,0.12623255,0.3555381,0.11817655,0.08675095,0.5353386,0.284703,0.17379885,0.27756795,0.30235175]},{"run_name":"icl-gemini-3-flash","task":"codebase_adaptation","run_index":1,"reward":4.85,"baseline_reward":7.749999999999999,"reference_reward":19.0,"gain":-2.8999999999999995,"normalized_reward":-0.481675392670157,"normalized_gain":-0.2577777777777777,"cost_usd":1.5928528,"latency_seconds":3.839299,"instance_count":19,"reward_curve":[0.725,0.0,0.0,0.75,0.0,0.0,0.0,0.75,0.85,0.0,0.0,0.0,0.0,0.0,0.9,0.875,0.0,0.0,0.0],"baseline_reward_curve":[0.65,0.225,0.45,0.0,0.6,0.7,0.725,0.0,0.7,0.0,0.475,0.35,0.675,0.8,0.0,0.0,0.625,0.45,0.325],"gain_curve":[0.07499999999999996,-0.225,-0.45,0.75,-0.6,-0.7,-0.725,0.75,0.15000000000000002,0.0,-0.475,-0.35,-0.675,-0.8,0.9,0.875,-0.625,-0.45,-0.325],"cost_curve":[0.033588,0.01707875,0.04538535,0.03790085,0.05314095,0.0393083,0.036887,0.04767075,0.04575475,0.1252176,0.11216995,0.10566245,0.10705025,0.1150198,0.039116,0.0631776,0.45870095,0.07621795,0.03380555]},{"run_name":"icl-gemini-3-flash","task":"codebase_adaptation","run_index":2,"reward":7.0,"baseline_reward":7.749999999999999,"reference_reward":19.0,"gain":-0.7499999999999991,"normalized_reward":-0.25654450261780093,"normalized_gain":-0.06666666666666658,"cost_usd":3.1726071,"latency_seconds":7.406324,"instance_count":19,"reward_curve":[0.55,0.0,0.675,0.575,0.725,0.625,0.0,0.0,0.775,0.0,0.0,0.0,0.875,0.0,0.875,0.0,0.0,0.675,0.65],"baseline_reward_curve":[0.65,0.225,0.45,0.0,0.6,0.7,0.725,0.0,0.7,0.0,0.475,0.35,0.675,0.8,0.0,0.0,0.625,0.45,0.325],"gain_curve":[-0.09999999999999998,-0.225,0.22500000000000003,0.575,0.125,-0.07499999999999996,-0.725,0.0,0.07500000000000007,0.0,-0.475,-0.35,0.19999999999999996,-0.8,0.875,0.0,-0.625,0.22500000000000003,0.325],"cost_curve":[0.0311773,0.08469345,0.06458285,0.08395955,0.06821505,0.10079825,0.0868707,0.0754652,0.07801555,0.43445805,0.42962355,0.6168739,0.08344895,0.09315075,0.0919016,0.09676605,0.1654994,0.23736505,0.2497419]},{"run_name":"icl-gemini-3-flash","task":"codebase_adaptation","run_index":3,"reward":9.799999999999997,"baseline_reward":7.749999999999999,"reference_reward":19.0,"gain":2.049999999999998,"normalized_reward":0.03664921465968564,"normalized_gain":0.18222222222222204,"cost_usd":2.42311305,"latency_seconds":5.668922,"instance_count":19,"reward_curve":[0.0,0.0,0.775,0.0,0.0,0.725,0.0,0.725,0.875,0.8,0.85,0.675,0.0,0.8,0.75,0.55,0.75,0.7,0.825],"baseline_reward_curve":[0.65,0.225,0.45,0.0,0.6,0.7,0.725,0.0,0.7,0.0,0.475,0.35,0.675,0.8,0.0,0.0,0.625,0.45,0.325],"gain_curve":[-0.65,-0.225,0.325,0.0,-0.6,0.025000000000000022,-0.725,0.725,0.17500000000000004,0.8,0.375,0.32500000000000007,-0.675,0.0,0.75,0.55,0.125,0.24999999999999994,0.49999999999999994],"cost_curve":[0.138529,0.04881445,0.05441075,0.1048328,0.07599405,0.07650945,0.0529371,0.08176595,0.04747495,0.0779902,0.0625498,0.11405595,0.4882767,0.1055727,0.13480525,0.32565515,0.14565065,0.1692736,0.11801455]},{"run_name":"icl-gemini-3-flash","task":"codebase_adaptation","run_index":4,"reward":9.275,"baseline_reward":7.749999999999999,"reference_reward":19.0,"gain":1.5250000000000012,"normalized_reward":-0.01832460732984282,"normalized_gain":0.13555555555555568,"cost_usd":3.6665282,"latency_seconds":9.550352,"instance_count":19,"reward_curve":[0.475,0.0,0.675,0.725,0.9,0.675,0.875,0.85,0.75,0.525,0.875,0.7,0.0,0.0,0.0,0.0,0.375,0.0,0.875],"baseline_reward_curve":[0.65,0.225,0.45,0.0,0.6,0.7,0.725,0.0,0.7,0.0,0.475,0.35,0.675,0.8,0.0,0.0,0.625,0.45,0.325],"gain_curve":[-0.17500000000000004,-0.225,0.22500000000000003,0.725,0.30000000000000004,-0.02499999999999991,0.15000000000000002,0.85,0.050000000000000044,0.525,0.4,0.35,-0.675,-0.8,0.0,0.0,-0.25,-0.45,0.55],"cost_curve":[0.0323025,0.0676811,0.05882865,0.0670153,0.03376555,0.0634884,0.0346654,0.04760225,0.06846945,0.1280747,0.05442425,0.1523333,0.2071277,0.32039695,0.72741455,0.10376805,0.436361,0.9387085,0.1241006]},{"run_name":"icl-gemini-3-flash","task":"cohort_studies","run_index":0,"reward":0.2179,"baseline_reward":0.1396,"reference_reward":3.24404,"gain":0.07830000000000001,"normalized_reward":-0.345226135120958,"normalized_gain":0.025221940188890755,"cost_usd":1.1133202,"latency_seconds":3.541409,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1521,0.0,0.0,0.0,0.0,0.0,0.0,0.0658,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0085,0.0,0.0,0.0,0.0,0.0,0.0,0.1165,0.0146,0.0,0.0],"gain_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1436,0.0,0.0,0.0,0.0,0.0,0.0,-0.05070000000000001,-0.0146,0.0,0.0],"cost_curve":[0.04811935,0.04820065,0.03985595,0.0406068,0.04516825,0.0352608,0.04186595,0.04188835,0.0475329,0.04655775,0.0496961,0.04935,0.0612005,0.06097565,0.05504085,0.0545832,0.06216055,0.05831265,0.1120894,0.11485455]},{"run_name":"icl-gemini-3-flash","task":"cohort_studies","run_index":1,"reward":0.2115,"baseline_reward":0.1396,"reference_reward":3.24404,"gain":0.07189999999999999,"normalized_reward":-0.3480711612151817,"normalized_gain":0.023160376750718324,"cost_usd":1.5948854,"latency_seconds":7.332849,"instance_count":20,"reward_curve":[0.0,0.0021,0.0894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0812,0.0,0.0,0.0,0.0388],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0085,0.0,0.0,0.0,0.0,0.0,0.0,0.1165,0.0146,0.0,0.0],"gain_curve":[0.0,0.0021,0.0894,0.0,0.0,0.0,0.0,0.0,0.0,-0.0085,0.0,0.0,0.0,0.0,0.0,0.0812,-0.1165,-0.0146,0.0,0.0388],"cost_curve":[0.05836155,0.06588845,0.0564537,0.053049,0.07968215,0.0633676,0.0603339,0.0696476,0.0768827,0.0709055,0.0648884,0.06905785,0.1294649,0.11545625,0.1715958,0.05714865,0.08742345,0.08169505,0.0817079,0.081875]},{"run_name":"icl-gemini-3-flash","task":"cohort_studies","run_index":2,"reward":0.37979999999999997,"baseline_reward":0.1396,"reference_reward":3.24404,"gain":0.24019999999999997,"normalized_reward":-0.2732558656436427,"normalized_gain":0.07737305278890878,"cost_usd":1.21299585,"latency_seconds":3.967263,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.1399,0.0,0.0,0.2035,0.0,0.0,0.0,0.0,0.0,0.0364,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0085,0.0,0.0,0.0,0.0,0.0,0.0,0.1165,0.0146,0.0,0.0],"gain_curve":[0.0,0.0,0.0,0.0,0.1399,0.0,0.0,0.2035,0.0,-0.0085,0.0,0.0,0.0,0.0364,0.0,0.0,-0.1165,-0.0146,0.0,0.0],"cost_curve":[0.0519366,0.0464826,0.0455189,0.04380225,0.07300395,0.04755085,0.0420696,0.0387176,0.057701,0.0436921,0.0466899,0.0400659,0.0669536,0.05621525,0.0558273,0.0472348,0.07000515,0.1157776,0.11676625,0.10698465]},{"run_name":"icl-gemini-3-flash","task":"cohort_studies","run_index":3,"reward":1.3523,"baseline_reward":0.1396,"reference_reward":3.24404,"gain":1.2127000000000001,"normalized_reward":0.1590547400801942,"normalized_gain":0.39063405960495295,"cost_usd":0.77426435,"latency_seconds":3.696136,"instance_count":20,"reward_curve":[0.0,0.0,0.0142,0.0,0.0,0.0638,0.0,0.1603,0.0,0.0309,0.1732,0.0,0.0,0.0,0.0,0.0,0.2972,0.4407,0.1302,0.0418],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0085,0.0,0.0,0.0,0.0,0.0,0.0,0.1165,0.0146,0.0,0.0],"gain_curve":[0.0,0.0,0.0142,0.0,0.0,0.0638,0.0,0.1603,0.0,0.0224,0.1732,0.0,0.0,0.0,0.0,0.0,0.18070000000000003,0.4261,0.1302,0.0418],"cost_curve":[0.0508996,0.03896505,0.02865355,0.0327391,0.0414291,0.02879935,0.033603,0.0327639,0.05871575,0.0326458,0.03741155,0.035314,0.0476837,0.03721075,0.02754345,0.03340095,0.0529838,0.03639205,0.04841185,0.03869805]},{"run_name":"icl-gemini-3-flash","task":"cohort_studies","run_index":4,"reward":0.7191000000000001,"baseline_reward":0.1396,"reference_reward":3.24404,"gain":0.5795000000000001,"normalized_reward":-0.12242502911706386,"normalized_gain":0.18666812694076876,"cost_usd":1.5402107,"latency_seconds":4.036954,"instance_count":20,"reward_curve":[0.0,0.0603,0.1884,0.0,0.0,0.0,0.0,0.0314,0.0,0.0,0.0,0.0,0.0,0.154,0.0,0.0852,0.0,0.0939,0.0,0.1059],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0085,0.0,0.0,0.0,0.0,0.0,0.0,0.1165,0.0146,0.0,0.0],"gain_curve":[0.0,0.0603,0.1884,0.0,0.0,0.0,0.0,0.0314,0.0,-0.0085,0.0,0.0,0.0,0.154,0.0,0.0852,-0.1165,0.0793,0.0,0.1059],"cost_curve":[0.0593155,0.0565544,0.04988655,0.0498353,0.06441895,0.05360375,0.05241975,0.05142125,0.06057515,0.05934935,0.06541165,0.06713485,0.11335345,0.07505715,0.07571085,0.0554064,0.13335195,0.2474886,0.0751209,0.07479495]},{"run_name":"icl-gemini-3-flash","task":"database_exploration","run_index":0,"reward":14.26666666666667,"baseline_reward":3.533333333333333,"reference_reward":40.0,"gain":10.733333333333336,"normalized_reward":0.25338491295938115,"normalized_gain":0.29433272394881177,"cost_usd":0.4058726,"latency_seconds":1.432295,"instance_count":40,"reward_curve":[0.0,0.5333333333333333,0.0,0.0,0.0,0.9333333333333333,0.7333333333333334,0.0,0.7333333333333334,0.9333333333333333,0.6,0.8,0.0,0.8,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.2666666666666667,0.33333333333333337,0.0,0.0,0.19999999999999996,0.0,0.33333333333333337,0.19999999999999996,0.1333333333333333,0.6,0.0,0.0,0.0,0.0,0.6666666666666667,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.2666666666666667,0.0],"gain_curve":[0.0,0.5333333333333333,0.0,0.0,-0.2666666666666667,0.6,0.7333333333333334,0.0,0.5333333333333334,0.9333333333333333,0.2666666666666666,0.6000000000000001,-0.1333333333333333,0.20000000000000007,0.0,0.9333333333333333,0.9333333333333333,0.0,0.2666666666666666,0.4666666666666667,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,-0.1333333333333333,0.9333333333333333,0.0,-0.2666666666666667,0.0],"cost_curve":[0.0152232,0.01880385,0.00310735,0.0080893,0.00518445,0.0056944,0.01777145,0.01286355,0.02124625,0.00379345,0.0185041,0.01426795,0.008213,0.019836,0.00372565,0.0046974,0.0053936,0.00904535,0.0068542,0.0115649,0.01816915,0.0103094,0.01088285,0.0098458,0.00576965,0.006567,0.00734255,0.0078821,0.0184779,0.01078815,0.0115805,0.0085079,0.0056281,0.0064099,0.0071434,0.00796195,0.008491,0.00930345,0.0098169,0.01111555]},{"run_name":"icl-gemini-3-flash","task":"database_exploration","run_index":1,"reward":13.400000000000002,"baseline_reward":3.533333333333333,"reference_reward":40.0,"gain":9.866666666666669,"normalized_reward":0.22823984526112193,"normalized_gain":0.27056672760511885,"cost_usd":0.45640945,"latency_seconds":1.399822,"instance_count":40,"reward_curve":[0.33333333333333337,0.0,0.0,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.9333333333333333,0.9333333333333333,0.8,0.9333333333333333,0.6,0.8666666666666667,0.0,0.0,0.9333333333333333,0.8666666666666667,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.8,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.2666666666666667,0.33333333333333337,0.0,0.0,0.19999999999999996,0.0,0.33333333333333337,0.19999999999999996,0.1333333333333333,0.6,0.0,0.0,0.0,0.0,0.6666666666666667,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.2666666666666667,0.0],"gain_curve":[0.33333333333333337,0.0,0.0,0.0,-0.2666666666666667,-0.33333333333333337,0.0,0.7333333333333334,-0.19999999999999996,0.9333333333333333,0.6,0.6000000000000001,0.8,0.0,0.8666666666666667,0.0,0.0,0.9333333333333333,0.19999999999999996,0.5333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,-0.1333333333333333,0.0,0.9333333333333333,0.5333333333333333,0.9333333333333333],"cost_curve":[0.01034385,0.0054897,0.00333925,0.0067394,0.0085635,0.04712355,0.00816745,0.0160535,0.01421295,0.005791,0.006549,0.0146307,0.00845275,0.0384786,0.00772025,0.00557555,0.006203,0.0069659,0.01150385,0.0084595,0.00918955,0.0206882,0.01082055,0.0056202,0.00625725,0.01054435,0.01204095,0.01405515,0.01029885,0.0110474,0.0139467,0.00541415,0.00608255,0.0067075,0.0073355,0.01267355,0.0093018,0.0098547,0.02211055,0.01205675]},{"run_name":"icl-gemini-3-flash","task":"database_exploration","run_index":2,"reward":14.000000000000002,"baseline_reward":3.533333333333333,"reference_reward":40.0,"gain":10.466666666666669,"normalized_reward":0.2456479690522244,"normalized_gain":0.28702010968921393,"cost_usd":0.38994115,"latency_seconds":1.395822,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.8,0.7333333333333334,0.8,0.9333333333333333,0.0,0.8,0.0,0.9333333333333333,0.8,0.0,0.0,0.8,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.2666666666666667,0.33333333333333337,0.0,0.0,0.19999999999999996,0.0,0.33333333333333337,0.19999999999999996,0.1333333333333333,0.6,0.0,0.0,0.0,0.0,0.6666666666666667,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.2666666666666667,0.0],"gain_curve":[0.0,0.0,0.0,0.8,0.4666666666666667,0.4666666666666667,0.9333333333333333,0.0,0.6000000000000001,0.0,0.6,0.6000000000000001,-0.1333333333333333,-0.6,0.8,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.2666666666666666,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,-0.2666666666666667,0.9333333333333333],"cost_curve":[0.00588925,0.01012105,0.00237275,0.0076627,0.00924535,0.0073435,0.0046082,0.01136255,0.0139403,0.01231,0.0090599,0.0104599,0.01294,0.01435665,0.0171613,0.0097421,0.006624,0.0037117,0.00450325,0.00782785,0.00621465,0.01512495,0.043416,0.0050688,0.0118478,0.0069621,0.0080536,0.0086487,0.00928825,0.0098597,0.01073475,0.01146425,0.0085158,0.00551235,0.00633535,0.00695235,0.00753525,0.0083732,0.00912375,0.00966725]},{"run_name":"icl-gemini-3-flash","task":"database_exploration","run_index":3,"reward":16.6,"baseline_reward":3.533333333333333,"reference_reward":40.0,"gain":13.066666666666668,"normalized_reward":0.321083172147002,"normalized_gain":0.3583180987202925,"cost_usd":0.38146345,"latency_seconds":1.593344,"instance_count":40,"reward_curve":[0.0,0.8,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.6,0.9333333333333333,0.7333333333333334,0.8,0.0,0.0,0.8,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.7333333333333334,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.2666666666666667,0.33333333333333337,0.0,0.0,0.19999999999999996,0.0,0.33333333333333337,0.19999999999999996,0.1333333333333333,0.6,0.0,0.0,0.0,0.0,0.6666666666666667,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.2666666666666667,0.0],"gain_curve":[0.0,0.8,0.0,0.0,0.6666666666666666,0.6,0.9333333333333333,0.0,-0.19999999999999996,0.6,0.6,0.5333333333333334,0.6666666666666667,-0.6,0.0,0.8,0.9333333333333333,0.9333333333333333,0.2666666666666666,-0.4,0.0,0.0,0.7333333333333334,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,-0.1333333333333333,0.9333333333333333,0.0,0.6666666666666666,0.9333333333333333],"cost_curve":[0.0099928,0.0045527,0.01063155,0.001729,0.0024612,0.00323755,0.0037899,0.00460475,0.00820545,0.0261121,0.0093167,0.01111745,0.00894155,0.0056537,0.00634925,0.01505945,0.0083336,0.0162962,0.0098314,0.00696435,0.0042502,0.01040175,0.01711385,0.0079871,0.008624,0.0202599,0.01670335,0.0055615,0.00648945,0.01187425,0.00848555,0.00924055,0.01003505,0.01052585,0.0183417,0.01489235,0.00594255,0.0065339,0.0071863,0.00783365]},{"run_name":"icl-gemini-3-flash","task":"database_exploration","run_index":4,"reward":16.866666666666667,"baseline_reward":3.533333333333333,"reference_reward":40.0,"gain":13.333333333333334,"normalized_reward":0.3288201160541587,"normalized_gain":0.3656307129798903,"cost_usd":0.44960575,"latency_seconds":1.544864,"instance_count":40,"reward_curve":[0.33333333333333337,0.6666666666666667,0.0,0.9333333333333333,0.9333333333333333,0.0,0.4,0.8666666666666667,0.9333333333333333,0.8666666666666667,0.6666666666666667,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.2666666666666667,0.33333333333333337,0.0,0.0,0.19999999999999996,0.0,0.33333333333333337,0.19999999999999996,0.1333333333333333,0.6,0.0,0.0,0.0,0.0,0.6666666666666667,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.0,0.0,0.2666666666666667,0.0],"gain_curve":[0.33333333333333337,0.6666666666666667,0.0,0.9333333333333333,0.6666666666666666,-0.33333333333333337,0.4,0.8666666666666667,0.7333333333333334,0.8666666666666667,0.33333333333333337,-0.19999999999999996,-0.1333333333333333,0.33333333333333337,0.0,0.9333333333333333,0.0,0.9333333333333333,0.2666666666666666,0.5333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0,0.8,0.0,0.9333333333333333,-0.2666666666666667,0.0],"cost_curve":[0.009809,0.0070773,0.015943,0.00296565,0.00353545,0.0116337,0.03557755,0.01380165,0.00655755,0.0056394,0.01699515,0.00702565,0.012191,0.0087846,0.03284755,0.00629355,0.01454495,0.0083759,0.00905685,0.0101445,0.01638465,0.00984805,0.0096729,0.00652405,0.00730145,0.0079719,0.0085643,0.0193439,0.0108732,0.01137615,0.00838375,0.0056905,0.00662425,0.01102155,0.00779425,0.0087257,0.00943475,0.01006725,0.0226765,0.01252675]},{"run_name":"icl-gemini-3-flash","task":"exploitable_poker","run_index":0,"reward":64.0,"baseline_reward":196.8,"reference_reward":1138.5,"gain":-132.8,"normalized_reward":-0.06926062294755697,"normalized_gain":-0.14102155675905279,"cost_usd":1.74149,"latency_seconds":2.296148,"instance_count":120,"reward_curve":[-1.0,-0.5,-1.0,4.0,-2.0,1.0,4.0,-4.0,34.0,-3.0,2.0,-11.5,1.0,1.0,-1.0,-1.0,4.0,-1.0,2.0,24.0,-1.0,-5.0,-1.0,-1.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,0.0,0.5,1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-1.0,-0.5,-1.0,-1.0,-3.0,10.0,-1.0,-0.5,-1.0,1.0,1.0,-8.0,-8.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-1.0,-0.5,1.0,-1.0,-0.5,1.0,-2.0,1.0,8.0,0.5,-1.0,-1.0,4.0,-0.5,0.5,-1.0,1.0,1.0,0.5,7.0,-3.0,3.0,-1.0,1.0,1.0,1.0,-1.0,0.5,1.0,-5.0,-1.0,0.5,0.5,1.0,-1.0,4.0,3.0,0.5,-1.0,1.0,-1.0,1.0,-1.0,0.5,-11.0,-1.0,1.0,3.0,2.0,-3.0,-3.0,-1.0,-1.0,17.0,4.0,0.5,-3.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-5.0,1.0,4.0,-1.0,36.5,-7.0,1.0,-2.5,2.5,8.0,-4.0,-6.0,6.0,-5.0,2.5,26.0,2.0,-5.0,2.0,-1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,0.0,0.5,1.0,-4.0,0.5,0.5,-2.0,0.0,-1.0,10.0,-1.0,-1.0,-1.0,-1.0,2.0,-2.0,10.0,-1.0,-0.5,-1.0,2.0,1.0,-8.0,-4.0,1.0,3.5,2.5,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,1.0,5.5,1.0,-1.0,1.0,19.5,0.5,-2.5,-1.0,1.0,-0.5,0.5,-2.0,1.0,1.0,0.5,2.5,-1.0,3.5,-2.2,1.0,7.0,1.0,-2.0,0.5,-1.0,-6.0,-1.0,0.5,0.5,1.0,2.0,4.0,3.0,0.5,2.0,-3.0,-1.0,-1.0,2.0,0.5,-18.0,-2.0,3.0,14.0,2.0,2.0,-3.0,-4.0,-2.0,100.0,4.0,0.5,-3.0],"gain_curve":[0.0,-1.5,0.0,0.0,3.0,0.0,0.0,-3.0,-2.5,4.0,1.0,-9.0,-1.5,-7.0,3.0,5.0,-2.0,4.0,-0.5,-2.0,-3.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.5,0.0,-3.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-4.0,0.0,-2.5,-1.5,0.0,0.0,0.0,-1.5,0.0,0.0,0.0,-1.5,0.0,-2.0,-6.0,0.0,-1.0,0.0,-11.5,0.0,1.5,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,4.5,-2.0,-0.5,1.2000000000000002,0.0,-6.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,-3.0,0.0,0.0,0.0,-3.0,4.0,0.0,2.0,-3.0,0.0,7.0,1.0,-2.0,-11.0,0.0,-5.0,0.0,3.0,1.0,-83.0,0.0,0.0,0.0],"cost_curve":[0.003595,0.00143,0.0046398,0.0060001,0.0065211,0.0054907,0.00871105,0.0081613,0.01117875,0.0142641,0.0175344,0.01338755,0.0089834,0.01189155,0.01494155,0.0175626,0.0207105,0.00907365,0.0122572,0.0153775,0.004312,0.03527565,0.0081303,0.00831745,0.00992065,0.0037199,0.0,0.00397555,0.0,0.00864285,0.01925165,0.0,0.00526975,0.01126705,0.0,0.0,0.01495755,0.0127033,0.0036649,0.0212873,0.01478915,0.0053239,0.00559145,0.0240407,0.031299,0.006755,0.0204938,0.00948265,0.0050426,0.01078835,0.0237087,0.0265237,0.0190027,0.0182959,0.02124795,0.0241824,0.0271133,0.02280425,0.0185587,0.02140145,0.0058737,0.0,0.0,0.0253681,0.0069273,0.0070549,0.03017695,0.0044475,0.00953505,0.0212414,0.0242731,0.02715345,0.0,0.0303061,0.0259141,0.01063715,0.00556815,0.0,0.0242227,0.00661075,0.0067583,0.0,0.0291728,0.0321673,0.0135663,0.02183995,0.0059804,0.02574035,0.0068939,0.02958585,0.0,0.025949,0.0335843,0.0292468,0.0,0.0,0.02527585,0.01379975,0.0220238,0.01559565,0.0,0.01644375,0.00849665,0.0357035,0.00938965,0.0060158,0.0,0.04023145,0.0073393,0.00757485,0.02394455,0.03436265,0.0183161,0.0417128,0.0137186,0.00716735,0.0552448,0.01749805,0.0,0.05698615]},{"run_name":"icl-gemini-3-flash","task":"exploitable_poker","run_index":1,"reward":129.5,"baseline_reward":196.8,"reference_reward":1138.5,"gain":-67.30000000000001,"normalized_reward":-0.0040800079609911374,"normalized_gain":-0.0714664967611766,"cost_usd":1.5652125,"latency_seconds":2.432126,"instance_count":120,"reward_curve":[34.0,2.0,1.0,-1.0,8.0,4.0,-1.0,-1.0,6.0,5.5,-4.0,43.0,15.0,-1.0,2.5,-6.0,-4.0,-1.0,-6.0,-0.5,-1.0,-0.5,0.0,1.0,-0.5,0.5,10.0,-3.0,2.0,0.5,-1.0,-1.0,0.5,-1.0,-1.0,0.5,-1.0,2.0,-0.5,0.5,-2.0,-0.5,-2.0,1.0,-1.0,10.0,-0.5,-1.0,-0.5,1.0,-1.0,15.0,-2.0,-8.0,-8.0,1.0,1.0,1.0,7.0,-1.0,5.0,-0.5,-4.0,2.5,-2.0,-0.5,0.5,0.5,-1.0,3.5,0.5,-0.5,-0.5,-0.5,-0.5,-0.5,1.0,-1.0,-0.5,-0.5,0.5,0.5,1.0,0.5,-0.5,-1.0,16.0,-1.0,-0.5,1.0,0.5,-1.0,1.0,1.0,-0.5,-1.0,0.5,-0.5,3.0,-1.0,-28.0,-0.5,-1.0,0.5,-1.0,2.0,-2.0,1.0,0.5,4.0,-1.0,-1.0,19.0,-1.0,4.0,0.5,3.0,1.0,-3.0,1.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-5.0,1.0,4.0,-1.0,36.5,-7.0,1.0,-2.5,2.5,8.0,-4.0,-6.0,6.0,-5.0,2.5,26.0,2.0,-5.0,2.0,-1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,0.0,0.5,1.0,-4.0,0.5,0.5,-2.0,0.0,-1.0,10.0,-1.0,-1.0,-1.0,-1.0,2.0,-2.0,10.0,-1.0,-0.5,-1.0,2.0,1.0,-8.0,-4.0,1.0,3.5,2.5,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,1.0,5.5,1.0,-1.0,1.0,19.5,0.5,-2.5,-1.0,1.0,-0.5,0.5,-2.0,1.0,1.0,0.5,2.5,-1.0,3.5,-2.2,1.0,7.0,1.0,-2.0,0.5,-1.0,-6.0,-1.0,0.5,0.5,1.0,2.0,4.0,3.0,0.5,2.0,-3.0,-1.0,-1.0,2.0,0.5,-18.0,-2.0,3.0,14.0,2.0,2.0,-3.0,-4.0,-2.0,100.0,4.0,0.5,-3.0],"gain_curve":[35.0,1.0,2.0,-5.0,13.0,3.0,-5.0,0.0,-30.5,12.5,-5.0,45.5,12.5,-9.0,6.5,0.0,-10.0,4.0,-8.5,-26.5,-3.0,4.5,-2.0,2.0,-1.5,1.0,9.5,-2.0,1.5,1.5,-1.0,-1.5,-0.5,3.0,-1.5,0.0,1.0,2.0,0.5,-9.5,-1.0,0.5,-1.0,2.0,-3.0,12.0,-10.5,0.0,0.0,2.0,-3.0,14.0,6.0,-4.0,-9.0,-2.5,-1.5,2.0,8.0,0.0,4.0,-1.0,-4.5,3.5,-3.0,-1.5,-0.5,-5.0,-2.0,4.5,-0.5,-20.0,-1.0,2.0,0.5,-1.5,1.5,-1.5,1.5,-1.5,-0.5,0.0,-1.5,1.5,-4.0,1.2000000000000002,15.0,-8.0,-1.5,3.0,0.0,0.0,7.0,2.0,-1.0,-1.5,-0.5,-2.5,-1.0,-4.0,-28.5,-2.5,2.0,1.5,0.0,0.0,-2.5,19.0,2.5,1.0,-15.0,-3.0,17.0,2.0,8.0,2.5,-97.0,-3.0,-3.5,4.0],"cost_curve":[0.003263,0.00417115,0.0050707,0.0052079,0.00553995,0.01174225,0.00925635,0.00822835,0.01119155,0.01408945,0.0168187,0.0159843,0.0117972,0.01084705,0.01376575,0.0169501,0.01968105,0.01178385,0.0144996,0.0032613,0.00335495,0.003577,0.0163785,0.00925165,0.0048766,0.0,0.0275308,0.0142419,0.0134667,0.0,0.0037957,0.00819295,0.0,0.00897955,0.00968365,0.0,0.0283599,0.02377255,0.0027443,0.0,0.0094471,0.0034709,0.013316,0.0085488,0.02744645,0.04471415,0.00583595,0.00605605,0.0062006,0.00646565,0.01683525,0.0163088,0.0189666,0.02167995,0.02437425,0.0235567,0.019137,0.0181769,0.0211227,0.0237608,0.02652065,0.0071321,0.0194383,0.02204815,0.02142295,0.0058071,0.0,0.0,0.02518685,0.013735,0.0,0.00728935,0.00738945,0.0076465,0.0078361,0.00445475,0.00462385,0.0202121,0.0055413,0.00569735,0.0,0.0,0.00597655,0.0,0.0062177,0.0266347,0.02975405,0.0360021,0.00497735,0.02167335,0.0,0.0245169,0.0065804,0.00674295,0.00698,0.0072581,0.0,0.0074922,0.02365925,0.0165835,0.03973455,0.0060469,0.0125834,0.0,0.00658905,0.02087705,0.02254635,0.03235465,0.0,0.01731885,0.0486535,0.00556125,0.04398215,0.01410785,0.02615235,0.0,0.01592195,0.03396775,0.04281925,0.00618785]},{"run_name":"icl-gemini-3-flash","task":"exploitable_poker","run_index":2,"reward":137.0,"baseline_reward":196.8,"reference_reward":1138.5,"gain":-59.80000000000001,"normalized_reward":0.003383421235943881,"normalized_gain":-0.06350217691409155,"cost_usd":1.7223041,"latency_seconds":2.468501,"instance_count":120,"reward_curve":[37.5,-4.0,-4.0,6.0,-1.0,12.0,-8.0,-12.0,-23.0,-0.5,30.0,-1.0,-1.0,14.0,4.0,1.0,8.0,-6.0,15.0,8.0,0.0,-0.5,1.0,-6.0,-0.5,0.5,1.0,-7.0,0.5,0.5,-1.0,1.0,1.0,-5.0,-1.0,0.5,-1.0,2.0,-2.0,-1.0,-0.5,10.0,-1.0,10.0,-1.0,0.5,0.0,-1.0,-1.0,-2.0,1.0,1.0,-8.0,-1.0,-1.0,1.0,12.0,-1.0,11.0,-8.0,0.5,4.0,0.5,-1.0,0.5,1.0,1.0,-2.0,-9.0,1.0,1.0,22.0,-1.0,-1.0,-0.5,3.0,1.0,1.0,-1.0,1.0,1.0,1.0,0.5,1.0,16.0,0.5,0.5,1.0,-7.0,4.0,2.5,-1.0,-1.0,0.5,-3.0,1.0,3.0,-1.0,4.0,-1.0,0.5,-3.0,-15.0,1.0,-4.0,2.0,3.0,0.5,2.0,17.5,5.0,-1.0,-1.0,1.0,-1.0,0.5,-2.0,0.5,3.0,4.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-5.0,1.0,4.0,-1.0,36.5,-7.0,1.0,-2.5,2.5,8.0,-4.0,-6.0,6.0,-5.0,2.5,26.0,2.0,-5.0,2.0,-1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,0.0,0.5,1.0,-4.0,0.5,0.5,-2.0,0.0,-1.0,10.0,-1.0,-1.0,-1.0,-1.0,2.0,-2.0,10.0,-1.0,-0.5,-1.0,2.0,1.0,-8.0,-4.0,1.0,3.5,2.5,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,1.0,5.5,1.0,-1.0,1.0,19.5,0.5,-2.5,-1.0,1.0,-0.5,0.5,-2.0,1.0,1.0,0.5,2.5,-1.0,3.5,-2.2,1.0,7.0,1.0,-2.0,0.5,-1.0,-6.0,-1.0,0.5,0.5,1.0,2.0,4.0,3.0,0.5,2.0,-3.0,-1.0,-1.0,2.0,0.5,-18.0,-2.0,3.0,14.0,2.0,2.0,-3.0,-4.0,-2.0,100.0,4.0,0.5,-3.0],"gain_curve":[38.5,-5.0,-3.0,2.0,4.0,11.0,-12.0,-11.0,-59.5,6.5,29.0,1.5,-3.5,6.0,8.0,7.0,2.0,-1.0,12.5,-18.0,-2.0,4.5,-1.0,-5.0,-1.5,1.0,0.5,-6.0,0.0,1.5,-1.0,0.5,0.0,-1.0,-1.5,0.0,1.0,2.0,-1.0,-11.0,0.5,11.0,0.0,11.0,-3.0,2.5,-10.0,0.0,-0.5,-1.0,-1.0,0.0,0.0,3.0,-2.0,-2.5,9.5,0.0,12.0,-7.0,-0.5,3.5,0.0,0.0,-0.5,0.0,0.0,-7.5,-10.0,2.0,0.0,2.5,-1.5,1.5,0.5,2.0,1.5,0.5,1.0,0.0,0.0,0.5,-2.0,2.0,12.5,2.7,-0.5,-6.0,-8.0,6.0,2.0,0.0,5.0,1.5,-3.5,0.5,2.0,-3.0,0.0,-4.0,0.0,-5.0,-12.0,2.0,-3.0,0.0,2.5,18.5,4.0,14.5,-9.0,-3.0,-3.0,4.0,3.0,2.5,-102.0,-3.5,2.5,7.0],"cost_curve":[0.003417,0.00446335,0.0057476,0.00511065,0.00486135,0.01135455,0.00609835,0.00889825,0.01183345,0.0034861,0.0155381,0.0182161,0.010429,0.00971295,0.01256275,0.0152028,0.0182354,0.01763845,0.00970985,0.0126316,0.0155095,0.00428635,0.0045214,0.0202726,0.005405,0.0,0.0057231,0.01316985,0.0,0.0,0.0033248,0.0071227,0.00785025,0.0325371,0.0165451,0.0,0.01186095,0.01518635,0.01044505,0.0120626,0.00435275,0.0250637,0.01108865,0.0313406,0.0063342,0.0,0.01470825,0.00414735,0.0088138,0.0199378,0.0227885,0.02536235,0.0211167,0.0163386,0.0191518,0.0219822,0.0249992,0.0277484,0.01965885,0.0189774,0.0,0.01055175,0.0,0.0234299,0.0,0.006339,0.013183,0.0287889,0.02450115,0.0202221,0.00543525,0.0238353,0.0267354,0.0295885,0.00785985,0.01264985,0.0047455,0.00498955,0.02184855,0.0246345,0.00660505,0.00684865,0.0,0.00703225,0.0301919,0.0,0.0,0.00802345,0.0307654,0.0109834,0.02399055,0.02690345,0.02968495,0.0,0.0328322,0.00868575,0.0254809,0.00590695,0.0124014,0.006491,0.0,0.03534485,0.04838645,0.008629,0.04862875,0.0261708,0.02162085,0.0,0.02327685,0.04239925,0.02739205,0.0352059,0.0132302,0.0068926,0.0143373,0.0,0.031021,0.0,0.01660445,0.02611705]},{"run_name":"icl-gemini-3-flash","task":"exploitable_poker","run_index":3,"reward":91.0,"baseline_reward":196.8,"reference_reward":1138.5,"gain":-105.80000000000001,"normalized_reward":-0.0423922778385909,"normalized_gain":-0.11235000530954657,"cost_usd":2.20554585,"latency_seconds":2.290862,"instance_count":120,"reward_curve":[5.2,3.5,2.0,6.0,-2.0,-6.0,4.0,-3.5,-5.0,2.2,-4.0,-0.5,-1.0,1.0,-1.0,-1.0,28.0,-1.0,3.5,43.5,-0.5,-0.5,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-2.0,1.0,0.5,0.5,-1.0,0.5,-0.5,0.0,-3.0,-0.5,12.0,12.0,-0.5,-3.0,-0.5,0.5,-1.0,-0.5,-2.0,0.5,1.0,-7.0,-4.0,1.0,14.0,1.0,1.0,-8.0,-1.0,2.0,-1.0,1.0,3.5,0.5,16.0,-1.0,5.4,-2.2,2.0,-1.0,0.5,-2.0,5.2,0.5,-2.2,1.0,-2.2,1.0,-1.0,0.5,-5.2,2.5,0.5,0.5,-14.0,-1.0,-1.0,1.0,1.0,0.5,-2.2,-1.0,-1.0,8.0,-1.0,-4.0,-18.0,20.0,-2.0,0.5,0.5,-2.0,4.0,4.5,0.5,1.0,2.0,3.5,-1.0,-3.0,2.0,0.5,2.0,-4.0,1.0,-3.0,-1.0,-2.0,-2.0,-4.0,3.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-5.0,1.0,4.0,-1.0,36.5,-7.0,1.0,-2.5,2.5,8.0,-4.0,-6.0,6.0,-5.0,2.5,26.0,2.0,-5.0,2.0,-1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,0.0,0.5,1.0,-4.0,0.5,0.5,-2.0,0.0,-1.0,10.0,-1.0,-1.0,-1.0,-1.0,2.0,-2.0,10.0,-1.0,-0.5,-1.0,2.0,1.0,-8.0,-4.0,1.0,3.5,2.5,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,1.0,5.5,1.0,-1.0,1.0,19.5,0.5,-2.5,-1.0,1.0,-0.5,0.5,-2.0,1.0,1.0,0.5,2.5,-1.0,3.5,-2.2,1.0,7.0,1.0,-2.0,0.5,-1.0,-6.0,-1.0,0.5,0.5,1.0,2.0,4.0,3.0,0.5,2.0,-3.0,-1.0,-1.0,2.0,0.5,-18.0,-2.0,3.0,14.0,2.0,2.0,-3.0,-4.0,-2.0,100.0,4.0,0.5,-3.0],"gain_curve":[6.2,2.5,3.0,2.0,3.0,-7.0,0.0,-2.5,-41.5,9.2,-5.0,2.0,-3.5,-7.0,3.0,5.0,22.0,4.0,1.0,17.5,-2.5,4.5,-3.0,0.0,-1.0,-0.5,-1.5,0.0,-1.5,-1.0,1.0,0.0,-0.5,3.0,0.0,-1.0,2.0,-3.0,0.5,2.0,13.0,0.5,-2.0,0.5,-1.5,1.0,-10.5,-1.0,1.0,2.0,-9.0,-5.0,9.0,18.0,0.0,-2.5,-10.5,0.0,3.0,0.0,0.0,3.0,0.0,17.0,-2.0,4.4,-3.2,-3.5,-2.0,1.5,-3.0,-14.3,0.0,0.2999999999999998,2.0,-3.2,1.5,-1.5,2.5,-6.2,1.5,0.0,-2.0,-13.0,-4.5,1.2000000000000002,0.0,-6.0,-0.5,-0.20000000000000018,-1.5,0.0,14.0,0.0,-4.5,-18.5,19.0,-4.0,-3.5,-2.5,-2.5,2.0,7.5,1.5,2.0,0.0,3.0,17.0,-1.0,-1.0,-13.5,0.0,-6.0,4.0,1.0,1.0,-102.0,-6.0,-4.5,6.0],"cost_curve":[0.0034345,0.006444,0.0057323,0.0051672,0.0052572,0.01186135,0.00666555,0.0093029,0.01246815,0.0189643,0.01845225,0.0050189,0.01121045,0.0104654,0.01334105,0.01588605,0.01869545,0.01792585,0.0102577,0.01309065,0.003743,0.00395005,0.0127592,0.0144141,0.02165945,0.00590455,0.0025527,0.00916495,0.00646755,0.011053,0.0040084,0.0,0.0,0.0182335,0.0,0.0049691,0.0219767,0.0205547,0.0032352,0.01919015,0.023435,0.00524935,0.0230084,0.0060954,0.0,0.01288795,0.0030437,0.0106156,0.0,0.00785205,0.0179988,0.0210749,0.031092,0.0268679,0.01517425,0.0179042,0.02101805,0.0237076,0.026454,0.0222018,0.00852915,0.0092481,0.0,0.02104575,0.023664,0.0265813,0.0365338,0.0213797,0.0205864,0.0,0.02719735,0.0264129,0.0,0.02937805,0.00776985,0.0260566,0.0251519,0.0244276,0.0,0.0274912,0.03033125,0.0,0.0,0.0335197,0.02170655,0.0284686,0.01349785,0.00698245,0.0,0.0299859,0.0327804,0.03185445,0.02398435,0.0269188,0.0298228,0.0504559,0.0394902,0.0258031,0.0,0.0,0.02901945,0.0228176,0.02467375,0.0,0.0263328,0.02790675,0.01593515,0.01305695,0.03584725,0.02367335,0.0,0.0168221,0.04527755,0.0193448,0.03354035,0.02918165,0.03207805,0.043872,0.05858415,0.20136775]},{"run_name":"icl-gemini-3-flash","task":"exploitable_poker","run_index":4,"reward":52.7,"baseline_reward":196.8,"reference_reward":1138.5,"gain":-144.10000000000002,"normalized_reward":-0.08050552293760573,"normalized_gain":-0.1530211319953276,"cost_usd":2.3315275,"latency_seconds":2.730552,"instance_count":120,"reward_curve":[4.0,-5.0,4.0,31.0,-4.0,4.5,4.0,2.0,1.0,-5.0,-9.0,-1.0,2.2,41.5,-3.5,-1.0,-4.0,1.0,1.0,-1.0,1.0,0.5,10.0,1.0,-1.0,0.0,2.0,-2.0,-5.0,10.0,-10.0,-1.0,0.5,-1.0,-2.0,-4.0,-1.0,0.0,-1.0,-1.0,0.5,-0.5,1.0,0.5,0.5,-1.0,1.0,-1.0,-2.0,-0.5,-4.0,-8.0,-8.0,-1.0,2.0,1.0,-1.0,4.0,1.0,1.0,1.0,8.5,1.0,0.5,-4.5,-1.0,-1.0,-1.0,4.0,0.5,-4.0,-4.0,1.0,-2.0,2.0,1.0,-1.0,0.5,0.5,-1.0,0.5,-1.0,1.0,1.0,-1.0,0.5,3.0,-1.0,8.0,1.0,0.5,-1.0,1.0,-1.0,-1.0,1.0,-1.0,2.0,-1.0,-4.0,0.5,4.0,2.0,-2.0,-3.0,-2.0,17.0,0.5,0.5,-2.0,3.0,-1.0,4.0,1.0,0.5,-2.0,-1.0,-15.0,-4.0,-1.0],"baseline_reward_curve":[-1.0,1.0,-1.0,4.0,-5.0,1.0,4.0,-1.0,36.5,-7.0,1.0,-2.5,2.5,8.0,-4.0,-6.0,6.0,-5.0,2.5,26.0,2.0,-5.0,2.0,-1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,0.0,0.5,1.0,-4.0,0.5,0.5,-2.0,0.0,-1.0,10.0,-1.0,-1.0,-1.0,-1.0,2.0,-2.0,10.0,-1.0,-0.5,-1.0,2.0,1.0,-8.0,-4.0,1.0,3.5,2.5,-1.0,-1.0,-1.0,1.0,0.5,0.5,-1.0,1.0,1.0,1.0,5.5,1.0,-1.0,1.0,19.5,0.5,-2.5,-1.0,1.0,-0.5,0.5,-2.0,1.0,1.0,0.5,2.5,-1.0,3.5,-2.2,1.0,7.0,1.0,-2.0,0.5,-1.0,-6.0,-1.0,0.5,0.5,1.0,2.0,4.0,3.0,0.5,2.0,-3.0,-1.0,-1.0,2.0,0.5,-18.0,-2.0,3.0,14.0,2.0,2.0,-3.0,-4.0,-2.0,100.0,4.0,0.5,-3.0],"gain_curve":[5.0,-6.0,5.0,27.0,1.0,3.5,0.0,3.0,-35.5,2.0,-10.0,1.5,-0.2999999999999998,33.5,0.5,5.0,-10.0,6.0,-1.5,-27.0,-1.0,5.5,8.0,2.0,-2.0,0.5,1.5,-1.0,-5.5,11.0,-10.0,-1.5,-0.5,3.0,-2.5,-4.5,1.0,0.0,0.0,-11.0,1.5,0.5,2.0,1.5,-1.5,1.0,-9.0,0.0,-1.5,0.5,-6.0,-9.0,0.0,3.0,1.0,-2.5,-3.5,5.0,2.0,2.0,0.0,8.0,0.5,1.5,-5.5,-2.0,-2.0,-6.5,3.0,1.5,-5.0,-23.5,0.5,0.5,3.0,0.0,-0.5,0.0,2.5,-2.0,-0.5,-1.5,-1.5,2.0,-4.5,2.7,2.0,-8.0,7.0,3.0,0.0,0.0,7.0,0.0,-1.5,0.5,-2.0,0.0,-5.0,-7.0,0.0,2.0,5.0,-1.0,-2.0,-4.0,16.5,18.5,2.5,-5.0,-11.0,-3.0,2.0,4.0,4.5,0.0,-101.0,-19.0,-4.5,2.0],"cost_curve":[0.0036245,0.004814,0.0059415,0.0051055,0.00519275,0.01159575,0.00653695,0.0092903,0.0121721,0.0153033,0.0183557,0.01047,0.0096583,0.0125879,0.0157367,0.01875055,0.0144314,0.01016035,0.0130669,0.0159906,0.00917565,0.0,0.02656105,0.0081846,0.0054779,0.0131218,0.01610925,0.01386295,0.03881115,0.01903345,0.02014715,0.00935305,0.0,0.01020375,0.01667655,0.0313077,0.01017495,0.0161151,0.01379605,0.01014975,0.0,0.00533975,0.01128595,0.0,0.0,0.01223275,0.0063729,0.01334335,0.01835605,0.0040979,0.018567,0.021467,0.02455165,0.0275348,0.01948385,0.01883265,0.02167465,0.0247146,0.0276361,0.02684195,0.01895585,0.02186685,0.01205775,0.0,0.02638805,0.0293641,0.02877755,0.0208081,0.02404765,0.0,0.0270922,0.0299928,0.0329187,0.0251714,0.02444595,0.02716935,0.03026215,0.0,0.0,0.07305405,0.0,0.02892,0.0059286,0.0061597,0.0262335,0.0,0.01434135,0.0309035,0.03411805,0.0089543,0.0,0.0307016,0.0263584,0.02929125,0.03205665,0.035131,0.01847235,0.03212065,0.01354805,0.03721005,0.0,0.02467365,0.02611205,0.03753475,0.04388455,0.02985745,0.05941405,0.0,0.0,0.0475608,0.0851168,0.0869751,0.0157351,0.01651065,0.0,0.03539245,0.01886135,0.06136585,0.0535458,0.0167858]},{"run_name":"icl-gemini-3-flash","task":"sales_prediction","run_index":0,"reward":8.6363,"baseline_reward":5.2707,"reference_reward":12.0,"gain":3.3656000000000006,"normalized_reward":0.47464350976931613,"normalized_gain":0.5001411736733391,"cost_usd":0.5303461,"latency_seconds":3.831286,"instance_count":12,"reward_curve":[0.6729,0.6469,0.6831,0.6616,0.8483,0.7339,0.7255,0.712,0.7183,0.7155,0.7576,0.7607],"baseline_reward_curve":[0.5894,0.5484,0.4471,0.6401,0.436,0.3976,0.1928,0.3343,0.369,0.4875,0.4495,0.379],"gain_curve":[0.08350000000000002,0.09850000000000003,0.23600000000000004,0.021499999999999964,0.41230000000000006,0.3363,0.5327000000000001,0.3777,0.34930000000000005,0.22800000000000004,0.30810000000000004,0.38170000000000004],"cost_curve":[0.054498,0.0584629,0.0423806,0.04546955,0.03216605,0.03895035,0.034085,0.0418176,0.048193,0.0518205,0.04102395,0.0414786]},{"run_name":"icl-gemini-3-flash","task":"sales_prediction","run_index":1,"reward":8.071900000000001,"baseline_reward":5.2707,"reference_reward":12.0,"gain":2.8012000000000015,"normalized_reward":0.38649319818201716,"normalized_gain":0.41626915132331765,"cost_usd":0.58691155,"latency_seconds":3.640511,"instance_count":12,"reward_curve":[0.6633,0.6472,0.6633,0.6521,0.6462,0.6351,0.6284,0.6359,0.6622,0.726,0.7576,0.7546],"baseline_reward_curve":[0.5894,0.5484,0.4471,0.6401,0.436,0.3976,0.1928,0.3343,0.369,0.4875,0.4495,0.379],"gain_curve":[0.07389999999999997,0.0988,0.2162,0.01200000000000001,0.2102,0.2375,0.4356,0.30160000000000003,0.2932,0.2385,0.30810000000000004,0.37560000000000004],"cost_curve":[0.0513327,0.0438487,0.0443789,0.0537549,0.0531831,0.0766153,0.0408491,0.0421011,0.0449067,0.04449925,0.0450815,0.0463603]},{"run_name":"icl-gemini-3-flash","task":"sales_prediction","run_index":2,"reward":8.0168,"baseline_reward":5.2707,"reference_reward":12.0,"gain":2.7461,"normalized_reward":0.37788745373045746,"normalized_gain":0.4080810782696566,"cost_usd":0.78448885,"latency_seconds":3.437079,"instance_count":12,"reward_curve":[0.5898,0.5921,0.6075,0.6012,0.6743,0.6164,0.6662,0.7519,0.6812,0.8394,0.6762,0.7206],"baseline_reward_curve":[0.5894,0.5484,0.4471,0.6401,0.436,0.3976,0.1928,0.3343,0.369,0.4875,0.4495,0.379],"gain_curve":[0.00039999999999995595,0.04369999999999996,0.16040000000000004,-0.038900000000000046,0.2383,0.21879999999999994,0.47340000000000004,0.4176,0.31220000000000003,0.35190000000000005,0.2267,0.3416],"cost_curve":[0.05968795,0.0900516,0.0684739,0.05437755,0.0376241,0.05371405,0.0562379,0.0883712,0.05147715,0.056432,0.10090405,0.0671374]},{"run_name":"icl-gemini-3-flash","task":"sales_prediction","run_index":3,"reward":9.173300000000001,"baseline_reward":5.2707,"reference_reward":12.0,"gain":3.9026000000000014,"normalized_reward":0.55851437674731,"normalized_gain":0.5799414500765312,"cost_usd":0.52725685,"latency_seconds":3.596125,"instance_count":12,"reward_curve":[0.6509,0.6469,0.7637,0.6148,0.8483,0.8004,0.7653,0.806,0.8009,0.8567,0.807,0.8124],"baseline_reward_curve":[0.5894,0.5484,0.4471,0.6401,0.436,0.3976,0.1928,0.3343,0.369,0.4875,0.4495,0.379],"gain_curve":[0.0615,0.09850000000000003,0.31660000000000005,-0.02529999999999999,0.41230000000000006,0.4028,0.5725,0.47170000000000006,0.43189999999999995,0.36920000000000003,0.35750000000000004,0.4334],"cost_curve":[0.046735,0.07448135,0.03965515,0.0342533,0.0378252,0.04267895,0.0420415,0.03801235,0.04526105,0.03966285,0.03950275,0.0471474]},{"run_name":"icl-gemini-3-flash","task":"sales_prediction","run_index":4,"reward":8.5059,"baseline_reward":5.2707,"reference_reward":12.0,"gain":3.2352000000000007,"normalized_reward":0.4542771018476581,"normalized_gain":0.4807632294592306,"cost_usd":0.51566285,"latency_seconds":3.354111,"instance_count":12,"reward_curve":[0.6039,0.6099,0.765,0.6836,0.6981,0.7268,0.7119,0.7398,0.7338,0.7186,0.7165,0.798],"baseline_reward_curve":[0.5894,0.5484,0.4471,0.6401,0.436,0.3976,0.1928,0.3343,0.369,0.4875,0.4495,0.379],"gain_curve":[0.014499999999999957,0.0615,0.3179,0.04349999999999998,0.26210000000000006,0.3292,0.5191,0.4055,0.3648,0.23110000000000003,0.267,0.41900000000000004],"cost_curve":[0.04852485,0.06073685,0.0433055,0.03669715,0.0458466,0.0382545,0.0419613,0.04039545,0.03471065,0.0407518,0.04292895,0.04154925]},{"run_name":"icl-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":0,"reward":37.08609999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":17.32639999999999,"normalized_reward":0.2466592633722004,"normalized_gain":0.24667320612241103,"cost_usd":3.4420942,"latency_seconds":13.687728,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3669,0.3601,0.3673,0.3518,0.3107,0.3397,0.3251,0.3607,0.3841,0.3626,0.485,0.3325,0.3562,0.3702,0.3601,0.3492,0.4484,0.4278,0.4184,0.4262,0.4262,0.4523,0.4523,0.4523,0.4493,0.4554,0.4493,0.4084,0.4084,0.3992,0.3712,0.3412,0.3525,0.3757,0.4033,0.4094,0.4111,0.4031,0.411,0.4347,0.4217,0.4069,0.3843,0.4226,0.4921,0.496,0.4645,0.4695,0.381,0.3804,0.3614,0.3804,0.4005,0.403,0.3828,0.3501,0.3206,0.2869,0.3095,0.3929,0.3814,0.3803,0.3717,0.3532,0.3948,0.3903,0.3919,0.4056,0.4889,0.4438,0.4267,0.3822,0.4223,0.3127,0.362,0.5337,0.5158,0.5096,0.4159,0.3728,0.4015,0.3108,0.7714,0.7714,0.6962,0.7429,0.6043],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.1405,0.13599999999999998,0.15450000000000003,0.1245,0.11569999999999997,0.11760000000000001,0.11249999999999999,0.12030000000000002,0.1556,0.14329999999999998,0.2367,0.1405,0.15880000000000002,0.14629999999999999,0.13309999999999997,0.14270000000000002,0.201,0.226,0.2165,0.21320000000000003,0.2179,0.2279,0.21899999999999997,0.24289999999999998,0.23879999999999998,0.22420000000000004,0.24209999999999998,0.2102,0.1999,0.1897,0.16849999999999998,0.1177,0.13859999999999997,0.17279999999999998,0.1619,0.21209999999999998,0.19080000000000003,0.17670000000000002,0.21839999999999998,0.19499999999999998,0.20010000000000003,0.17959999999999998,0.15689999999999998,0.20109999999999997,0.2612,0.2627,0.23580000000000004,0.25179999999999997,0.1595,0.17290000000000003,0.1487,0.15580000000000002,0.1753,0.20320000000000002,0.14669999999999997,0.15460000000000002,0.10499999999999998,0.044999999999999984,0.09809999999999999,0.17630000000000004,0.16040000000000001,0.18220000000000003,0.15619999999999998,0.126,0.1396,0.18149999999999997,0.17070000000000002,0.15150000000000002,0.2754,0.19659999999999997,0.19640000000000002,0.1614,0.18460000000000001,0.07049999999999998,0.14909999999999998,0.28489999999999993,0.31610000000000005,0.3017000000000001,0.1983,0.15620000000000003,0.19140000000000001,0.09150000000000003,0.571,0.5718,0.49450000000000005,0.49870000000000003,0.38209999999999994],"cost_curve":[0.00477,0.011044,0.014052,0.021294,0.022406,0.02201,0.032004,0.027736,0.0157844,0.026981,0.0246434,0.0266798,0.0231682,0.0158668,0.0321054,0.0294246,0.026767,0.0266596,0.0259348,0.0192514,0.0230736,0.0261992,0.0256754,0.0232978,0.0339636,0.021966,0.0249524,0.0205084,0.0349388,0.019481,0.0226694,0.0399018,0.0383858,0.0338502,0.0279606,0.0430168,0.0385612,0.0224774,0.0281418,0.0287302,0.0245946,0.0348688,0.0307112,0.0265798,0.0474322,0.0263468,0.0301072,0.0346618,0.0317766,0.048009,0.0311656,0.0367402,0.0405348,0.0341492,0.0382198,0.042985,0.0499474,0.0338362,0.0368946,0.0330696,0.036856,0.033035,0.0522994,0.0386916,0.0345524,0.0378606,0.0338556,0.0375858,0.041344,0.037825,0.0506294,0.0399046,0.041691,0.235996,0.0419106,0.0401918,0.0389802,0.0378978,0.0346144,0.0388922,0.0434022,0.0408972,0.0459536,0.0528266,0.27456,0.0476064,0.0387924,0.0547222,0.05557,0.064186]},{"run_name":"icl-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":1,"reward":30.058099999999992,"baseline_reward":19.7597,"reference_reward":90.0,"gain":10.298399999999994,"normalized_reward":0.14660089124275671,"normalized_gain":0.14661668586267418,"cost_usd":4.0207574,"latency_seconds":16.561865,"instance_count":90,"reward_curve":[0.2072,0.2245,0.2478,0.2669,0.2762,0.2657,0.2657,0.3209,0.3143,0.3262,0.4081,0.4068,0.4038,0.392,0.3428,0.3366,0.3507,0.3486,0.3522,0.4658,0.4433,0.4639,0.3892,0.4182,0.3278,0.3281,0.3888,0.3853,0.3759,0.4435,0.3953,0.3965,0.3966,0.4133,0.4151,0.4469,0.4486,0.4135,0.3829,0.3474,0.3474,0.3474,0.3489,0.1982,0.2356,0.2609,0.2709,0.3111,0.3089,0.3405,0.3062,0.2627,0.2731,0.3,0.3,0.2669,0.2564,0.2549,0.2549,0.2656,0.1997,0.2721,0.2951,0.3341,0.3687,0.3828,0.3553,0.4296,0.4196,0.404,0.404,0.404,0.404,0.404,0.3971,0.3988,0.3279,0.3507,0.4862,0.4604,0.4309,0.3003,0.2981,0.2069,0.2069,0.1727,0.1727,0.1727,0.1727,0.1727],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.0237,0.03609999999999999,0.040500000000000036,0.05210000000000001,0.0529,0.03839999999999999,0.1259,0.09220000000000003,0.11359999999999998,0.16770000000000002,0.1783,0.1845,0.14370000000000002,0.1508,0.13920000000000002,0.12680000000000002,0.12160000000000001,0.14570000000000002,0.21839999999999998,0.24150000000000002,0.262,0.1762,0.2099,0.10339999999999999,0.0948,0.17939999999999998,0.17479999999999998,0.14470000000000002,0.2363,0.1971,0.18800000000000003,0.18710000000000002,0.2106,0.19160000000000002,0.233,0.2457,0.17209999999999998,0.18560000000000001,0.1271,0.121,0.1548,0.10919999999999999,-0.023400000000000004,0.008300000000000002,0.03350000000000003,0.04939999999999997,0.0802,0.0756,0.11180000000000004,0.08850000000000002,0.04119999999999999,0.06560000000000002,0.08729999999999999,0.0754,0.041700000000000015,0.05660000000000001,0.01880000000000001,0.05940000000000001,0.04999999999999999,-0.042200000000000015,0.060700000000000004,0.07849999999999999,0.1131,0.17060000000000003,0.16729999999999998,0.1281,0.1744,0.21079999999999996,0.18280000000000002,0.14990000000000003,0.19050000000000003,0.15680000000000002,0.17370000000000002,0.1763,0.1611,0.08570000000000003,0.1378,0.23740000000000003,0.2607,0.223,0.08270000000000002,0.08149999999999999,-0.0032000000000000084,-0.012399999999999994,-0.027700000000000002,-0.026900000000000007,-0.028999999999999998,-0.07150000000000001,-0.049500000000000016],"cost_curve":[0.015742,0.01624,0.016998,0.022952,0.027446,0.024016,0.025992,0.029142,0.017907,0.0275154,0.031118,0.0290146,0.020255,0.0304754,0.0204178,0.0362892,0.0182176,0.0248328,0.0359534,0.0353418,0.0260354,0.0256662,0.0314024,0.0341642,0.0393766,0.037293,0.0198792,0.0324454,0.0259238,0.0216738,0.0393604,0.0423706,0.024397,0.0352492,0.0346476,0.038538,0.0299284,0.0412948,0.0313014,0.0303424,0.0416814,0.0456706,0.0402756,0.0257052,0.0255728,0.029261,0.0245968,0.0278946,0.0322186,0.0485394,0.0338916,0.0310524,0.0533248,0.0406014,0.0393606,0.0830854,0.0624504,0.0641794,0.0658748,0.073898,0.0317414,0.02773,0.0326318,0.0421714,0.0334224,0.0380704,0.0440726,0.042168,0.0482326,0.0479304,0.0537952,0.052341,0.0603132,0.0592722,0.286604,0.0588492,0.0664624,0.0750558,0.05625,0.0472176,0.0516874,0.0564294,0.0627362,0.0721734,0.084191,0.0788376,0.0877014,0.0974066,0.0972944,0.0916778]},{"run_name":"icl-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":2,"reward":32.506399999999985,"baseline_reward":19.7597,"reference_reward":90.0,"gain":12.746699999999986,"normalized_reward":0.18145759478352455,"normalized_gain":0.18147274427928106,"cost_usd":3.5915776,"latency_seconds":12.403146,"instance_count":90,"reward_curve":[0.2482,0.2758,0.3076,0.2603,0.3132,0.2949,0.3618,0.3229,0.3431,0.3269,0.3694,0.348,0.348,0.3404,0.329,0.3963,0.3963,0.3926,0.349,0.3745,0.3385,0.3234,0.322,0.3105,0.3141,0.3135,0.3135,0.3135,0.3135,0.2964,0.2964,0.2964,0.2909,0.2922,0.2558,0.2558,0.3059,0.3546,0.3504,0.4247,0.3973,0.3973,0.4344,0.3888,0.3957,0.4061,0.4628,0.4727,0.4636,0.4558,0.4311,0.4769,0.4747,0.4097,0.3365,0.3193,0.3676,0.3422,0.3477,0.3287,0.3387,0.3677,0.32,0.3639,0.3442,0.2817,0.3025,0.313,0.3195,0.3112,0.4761,0.4148,0.4148,0.3781,0.4357,0.3863,0.3903,0.3954,0.3783,0.4291,0.4554,0.5046,0.4868,0.4793,0.4784,0.3718,0.3645,0.3601,0.2769,0.2782],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.027599999999999986,0.09589999999999999,0.033899999999999986,0.08909999999999998,0.0821,0.1345,0.1279,0.12100000000000002,0.11430000000000001,0.129,0.11949999999999997,0.12869999999999998,0.09209999999999999,0.137,0.1989,0.1724,0.1656,0.1425,0.1271,0.13670000000000002,0.12150000000000002,0.10900000000000001,0.10219999999999999,0.0897,0.0802,0.1041,0.10300000000000001,0.08230000000000001,0.0892,0.09820000000000001,0.0879,0.0814,0.08950000000000002,0.03230000000000002,0.04190000000000002,0.10300000000000001,0.11320000000000002,0.15309999999999999,0.20440000000000003,0.1709,0.2047,0.1947,0.1672,0.1684,0.17870000000000003,0.2413,0.24180000000000001,0.2303,0.2271,0.21339999999999998,0.25539999999999996,0.2672,0.197,0.11190000000000003,0.09409999999999996,0.16779999999999998,0.1061,0.1522,0.11309999999999998,0.0968,0.15630000000000002,0.10340000000000002,0.1429,0.1461,0.06620000000000001,0.07529999999999998,0.05780000000000002,0.11069999999999999,0.08999999999999997,0.22200000000000003,0.2013,0.1676,0.1478,0.21489999999999998,0.14859999999999998,0.14809999999999998,0.18249999999999997,0.12950000000000003,0.2294,0.24750000000000003,0.28700000000000003,0.2702,0.2692,0.2591,0.17140000000000002,0.1649,0.15839999999999999,0.03269999999999998,0.055999999999999994],"cost_curve":[0.007826,0.016834,0.02156,0.021898,0.027722,0.028154,0.03382,0.038778,0.0214156,0.0229862,0.0326806,0.0224896,0.028026,0.0243184,0.0257218,0.0305524,0.0374408,0.0315044,0.0211188,0.0245514,0.0299812,0.0333498,0.0195976,0.02726,0.0379644,0.0399428,0.024183,0.0354574,0.0232578,0.0265222,0.0223648,0.036683,0.0457974,0.0388442,0.0291006,0.0258212,0.0249336,0.0185158,0.022332,0.026332,0.0217924,0.0246044,0.0279226,0.023459,0.0263192,0.0293294,0.0367478,0.0269298,0.0456038,0.0243822,0.0272142,0.0302602,0.0328148,0.029105,0.032215,0.0281976,0.0313298,0.0372998,0.0303744,0.0332966,0.0290372,0.0321014,0.0358538,0.0321786,0.0351026,0.0383168,0.0345656,0.0382118,0.0342186,0.037547,0.047724,0.0363662,0.0395564,0.0354132,0.0396416,0.0526424,0.0389306,0.042253,0.038532,0.0418482,0.037701,0.0410294,0.0442194,0.0404182,0.0438066,0.0471756,0.271742,0.276596,0.27714,0.0448716]},{"run_name":"icl-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":3,"reward":32.9798,"baseline_reward":19.7597,"reference_reward":90.0,"gain":13.220099999999999,"normalized_reward":0.18819744016856724,"normalized_gain":0.18821246492398236,"cost_usd":3.7229006,"latency_seconds":15.227148,"instance_count":90,"reward_curve":[0.192,0.2393,0.2206,0.2321,0.2222,0.2403,0.2422,0.2889,0.2872,0.329,0.3207,0.3316,0.3171,0.3446,0.3333,0.3165,0.3064,0.3397,0.3313,0.3565,0.3435,0.3385,0.3477,0.3587,0.3561,0.3561,0.4015,0.3812,0.39,0.3849,0.3798,0.3798,0.3798,0.3949,0.403,0.3905,0.3905,0.3905,0.4169,0.4169,0.4341,0.467,0.4585,0.4585,0.4585,0.4585,0.4164,0.4164,0.4078,0.3348,0.3175,0.3596,0.3679,0.3679,0.3679,0.3547,0.3421,0.3693,0.3693,0.3693,0.3551,0.3835,0.3672,0.3224,0.315,0.315,0.3194,0.3314,0.3015,0.2818,0.2726,0.4422,0.444,0.3565,0.3793,0.4262,0.4454,0.4581,0.4081,0.4253,0.4067,0.4045,0.3969,0.4371,0.491,0.4469,0.4424,0.4449,0.4362,0.4349],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.008899999999999991,0.008899999999999991,0.005700000000000011,-0.001899999999999985,0.027500000000000024,0.014899999999999997,0.09389999999999998,0.06510000000000002,0.1164,0.08029999999999998,0.1031,0.0978,0.09630000000000002,0.14129999999999998,0.11910000000000001,0.08250000000000002,0.1127,0.1248,0.10909999999999997,0.14170000000000002,0.13660000000000003,0.13470000000000001,0.1504,0.13170000000000004,0.12280000000000002,0.19210000000000002,0.1707,0.15880000000000002,0.17770000000000002,0.18160000000000004,0.17130000000000004,0.17030000000000003,0.19219999999999998,0.17950000000000002,0.1766,0.18760000000000002,0.1491,0.2196,0.1966,0.2077,0.27440000000000003,0.21880000000000002,0.23690000000000003,0.23120000000000002,0.23110000000000003,0.1949,0.1855,0.1745,0.1061,0.0998,0.13809999999999997,0.16040000000000001,0.1552,0.1433,0.1295,0.1423,0.1332,0.1738,0.1537,0.11320000000000002,0.1721,0.15060000000000004,0.10140000000000002,0.1169,0.0995,0.0922,0.07619999999999999,0.09269999999999998,0.06059999999999999,0.018500000000000016,0.2287,0.1968,0.12619999999999998,0.15850000000000003,0.18850000000000003,0.20320000000000002,0.2452,0.15930000000000002,0.22560000000000002,0.1988,0.18690000000000004,0.1803,0.22699999999999998,0.2717,0.24650000000000002,0.24280000000000002,0.24320000000000003,0.19199999999999998,0.2127],"cost_curve":[0.00882,0.011722,0.01457,0.022216,0.0211,0.01933,0.031274,0.02716,0.015268,0.0202346,0.0128148,0.0285152,0.0223614,0.0302978,0.0242822,0.0289266,0.0234172,0.0284458,0.0250284,0.0222114,0.0334658,0.0323642,0.0164834,0.071306,0.0293444,0.0186498,0.0354226,0.0258312,0.0416992,0.0407538,0.0215738,0.0250146,0.028183,0.0254154,0.0485542,0.0347506,0.0369392,0.0247598,0.0438704,0.0319968,0.0348754,0.043434,0.0296772,0.0334298,0.0299668,0.0337114,0.0299982,0.0336026,0.0443156,0.050416,0.0373886,0.0361696,0.037544,0.0369612,0.0395176,0.0343846,0.0581172,0.0346444,0.0459286,0.0598192,0.0384864,0.0581428,0.039238,0.0447126,0.0454058,0.0564804,0.0666098,0.0477946,0.0586262,0.0545472,0.0531132,0.0437452,0.0386712,0.033684,0.0363238,0.0389696,0.0353306,0.0379602,0.0407262,0.036517,0.0391128,0.269636,0.0377496,0.0506194,0.0439894,0.295718,0.0504854,0.0391372,0.0563632,0.0467292]},{"run_name":"icl-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":4,"reward":32.53219999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":12.77249999999999,"normalized_reward":0.181824912085878,"normalized_gain":0.18184005478336496,"cost_usd":4.4024186,"latency_seconds":20.689197,"instance_count":90,"reward_curve":[0.2273,0.2641,0.3414,0.3384,0.3207,0.394,0.378,0.3827,0.3633,0.4221,0.3762,0.3791,0.3824,0.3824,0.3629,0.3675,0.3496,0.3376,0.3363,0.4025,0.4082,0.423,0.4286,0.4286,0.4019,0.4019,0.4019,0.4019,0.4019,0.409,0.3948,0.3863,0.3303,0.3615,0.3702,0.4615,0.4615,0.4615,0.4615,0.4615,0.4615,0.4252,0.3477,0.3477,0.3842,0.4203,0.3909,0.3953,0.3483,0.3483,0.372,0.3684,0.3684,0.3435,0.3263,0.274,0.2817,0.3079,0.308,0.308,0.2774,0.2774,0.2778,0.3156,0.2778,0.2778,0.2778,0.2778,0.1985,0.1985,0.2343,0.197,0.4324,0.4455,0.4949,0.4709,0.4648,0.3825,0.1727,0.1727,0.1727,0.1727,0.4903,0.455,0.4319,0.3872,0.3638,0.4185,0.495,0.5274],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,0.015899999999999997,0.12969999999999998,0.11199999999999999,0.09659999999999999,0.18120000000000003,0.1507,0.18769999999999998,0.14120000000000002,0.20949999999999996,0.13579999999999998,0.15059999999999998,0.16310000000000002,0.13410000000000002,0.1709,0.1701,0.12570000000000003,0.1106,0.1298,0.15510000000000002,0.2064,0.2211,0.21559999999999999,0.22029999999999997,0.1775,0.16859999999999997,0.19249999999999998,0.1914,0.1707,0.20179999999999998,0.1966,0.17779999999999999,0.12079999999999999,0.1588,0.14669999999999997,0.24760000000000001,0.25860000000000005,0.22010000000000002,0.2642,0.24120000000000003,0.23510000000000003,0.23260000000000003,0.10800000000000001,0.12610000000000002,0.15689999999999998,0.19290000000000002,0.16940000000000002,0.1644,0.11499999999999999,0.11960000000000001,0.1543,0.1469,0.16090000000000002,0.13080000000000003,0.10169999999999998,0.04880000000000001,0.0819,0.0718,0.11249999999999999,0.09239999999999998,0.035499999999999976,0.06599999999999998,0.061200000000000004,0.09459999999999999,0.0797,0.062299999999999994,0.05059999999999998,0.02260000000000001,-0.010300000000000004,-0.022699999999999998,-0.019799999999999984,-0.016499999999999987,0.1852,0.2152,0.2741,0.2332,0.2226,0.1696,-0.0761,-0.026999999999999996,-0.03520000000000001,-0.044899999999999995,0.27370000000000005,0.2449,0.2126,0.1868,0.1642,0.2168,0.2508,0.30519999999999997],"cost_curve":[0.011626,0.01965,0.01548,0.028104,0.032034,0.021074,0.029502,0.035516,0.0244662,0.0239288,0.024908,0.0259246,0.034215,0.0152884,0.031967,0.0302176,0.0298234,0.021028,0.0240804,0.0239866,0.0230008,0.0292312,0.0353234,0.0244656,0.0354158,0.0232402,0.0403444,0.0337448,0.0258932,0.0326358,0.0251922,0.0284344,0.0304732,0.0282216,0.0396822,0.0398086,0.031505,0.028186,0.0316204,0.0278692,0.0452636,0.0286308,0.0326152,0.0366796,0.0537728,0.0516612,0.0340306,0.037765,0.0350582,0.0389166,0.0353038,0.0405464,0.0371178,0.0422862,0.0408618,0.0453446,0.0620724,0.047003,0.0448268,0.0496074,0.0491692,0.044395,0.0493856,0.0473856,0.0541524,0.0539286,0.054309,0.061104,0.0614066,0.0610672,0.0602418,0.316218,0.051654,0.0475398,0.0509416,0.289916,0.0442582,0.0500562,0.1134864,0.1580702,0.128652,0.1280452,0.0805884,0.0507916,0.0551292,0.059741,0.0490162,0.0553262,0.064638,0.0553336]},{"run_name":"icl-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":0,"reward":5.35,"baseline_reward":6.675,"reference_reward":19.0,"gain":-1.3250000000000002,"normalized_reward":-0.42931937172774864,"normalized_gain":-0.10750507099391483,"cost_usd":5.9177968,"latency_seconds":4.901572,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.75,0.5,0.525,0.0,0.0,0.7,0.25,0.6,0.275,0.9,0.85,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.575,0.65,0.7,0.85,0.0,0.0,0.0,0.9,0.0,0.95,0.85,0.0,0.0,0.65,0.0,0.55],"gain_curve":[0.0,0.0,0.0,0.17500000000000004,-0.15000000000000002,-0.17499999999999993,-0.85,0.0,0.7,0.25,-0.30000000000000004,0.275,-0.04999999999999993,0.0,0.0,0.0,-0.65,0.0,-0.55],"cost_curve":[0.10255,0.326325,0.2385066,0.1444188,0.3311874,0.374786,0.017907,0.1796604,0.279968,0.7338888,0.7781998,0.9374452,0.1558296,0.4721064,0.5629786,0.0895758,0.0661968,0.0874692,0.0387974]},{"run_name":"icl-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":1,"reward":5.275,"baseline_reward":6.675,"reference_reward":19.0,"gain":-1.3999999999999995,"normalized_reward":-0.4371727748691098,"normalized_gain":-0.11359026369168353,"cost_usd":1.407438,"latency_seconds":4.539275,"instance_count":19,"reward_curve":[0.875,0.0,0.0,0.0,0.0,0.0,0.0,0.85,0.0,0.0,0.85,0.0,0.0,0.0,0.95,0.85,0.0,0.9,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.575,0.65,0.7,0.85,0.0,0.0,0.0,0.9,0.0,0.95,0.85,0.0,0.0,0.65,0.0,0.55],"gain_curve":[0.875,0.0,0.0,-0.575,-0.65,-0.7,-0.85,0.85,0.0,0.0,-0.050000000000000044,0.0,-0.95,-0.85,0.95,0.85,-0.65,0.9,-0.55],"cost_curve":[0.052828,0.052096,0.0774236,0.072596,0.0570708,0.0955536,0.0950886,0.0950786,0.078876,0.0463414,0.101229,0.1134064,0.0532186,0.021489,0.0602274,0.1504422,0.0389896,0.1277304,0.0177528]},{"run_name":"icl-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":2,"reward":5.425,"baseline_reward":6.675,"reference_reward":19.0,"gain":-1.25,"normalized_reward":-0.42146596858638735,"normalized_gain":-0.10141987829614606,"cost_usd":5.1104776,"latency_seconds":5.341663,"instance_count":19,"reward_curve":[0.275,0.675,0.0,0.0,0.0,0.0,0.0,0.0,0.95,0.0,0.0,0.0,0.95,0.0,0.875,0.8,0.0,0.0,0.9],"baseline_reward_curve":[0.0,0.0,0.0,0.575,0.65,0.7,0.85,0.0,0.0,0.0,0.9,0.0,0.95,0.85,0.0,0.0,0.65,0.0,0.55],"gain_curve":[0.275,0.675,0.0,-0.575,-0.65,-0.7,-0.85,0.0,0.95,0.0,-0.9,0.0,0.0,-0.85,0.875,0.8,-0.65,0.0,0.35],"cost_curve":[0.3662482,0.2876546,0.1639526,0.1670534,0.0850874,0.1219544,0.1686156,0.185585,0.0920124,0.5126428,0.7086354,0.2049086,0.089069,0.4162362,0.1936586,0.2856302,0.2994794,0.5490628,0.212991]},{"run_name":"icl-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":3,"reward":2.7750000000000004,"baseline_reward":6.675,"reference_reward":19.0,"gain":-3.8999999999999995,"normalized_reward":-0.6989528795811517,"normalized_gain":-0.31643002028397565,"cost_usd":1.1521806,"latency_seconds":4.066156,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.925,0.0,0.0,0.925,0.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.575,0.65,0.7,0.85,0.0,0.0,0.0,0.9,0.0,0.95,0.85,0.0,0.0,0.65,0.0,0.55],"gain_curve":[0.0,0.0,0.0,-0.575,-0.65,-0.7,-0.85,0.925,0.0,0.0,0.025000000000000022,0.925,-0.95,-0.85,0.0,0.0,-0.65,0.0,-0.55],"cost_curve":[0.1845272,0.039559,0.0645418,0.0391304,0.0619812,0.023912,0.068927,0.0603998,0.1060436,0.128074,0.0669842,0.0736078,0.0775234,0.0559074,0.0190196,0.0208906,0.0177524,0.0194712,0.023928]},{"run_name":"icl-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":4,"reward":6.800000000000001,"baseline_reward":6.675,"reference_reward":19.0,"gain":0.1250000000000009,"normalized_reward":-0.27748691099476425,"normalized_gain":0.010141987829614677,"cost_usd":3.415284,"latency_seconds":4.237311,"instance_count":19,"reward_curve":[0.6,0.675,0.0,0.75,0.675,0.45,0.0,0.925,0.0,0.9,0.9,0.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.575,0.65,0.7,0.85,0.0,0.0,0.0,0.9,0.0,0.95,0.85,0.0,0.0,0.65,0.0,0.55],"gain_curve":[0.6,0.675,0.0,0.17500000000000004,0.025000000000000022,-0.24999999999999994,-0.85,0.925,0.0,0.9,0.0,0.925,-0.95,-0.85,0.0,0.0,-0.65,0.0,-0.55],"cost_curve":[0.149299,0.1347648,0.2405478,0.1369396,0.2354996,0.4106518,0.122953,0.0912118,0.1293858,0.0955006,0.129435,0.2340668,0.962726,0.086954,0.0774504,0.0584918,0.0603836,0.028554,0.0304686]},{"run_name":"icl-gemini-3.1-pro-preview","task":"cohort_studies","run_index":0,"reward":0.1308,"baseline_reward":0.8200999999999998,"reference_reward":3.24404,"gain":-0.6892999999999998,"normalized_reward":-0.3839451621220338,"normalized_gain":-0.2843717253727402,"cost_usd":1.9855558,"latency_seconds":7.882506,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0484,0.0,0.0,0.0214,0.0,0.0,0.0224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0386,0.0,0.0],"baseline_reward_curve":[0.0049,0.0061,0.0,0.0,0.0,0.0631,0.0,0.0001,0.0,0.0146,0.0329,0.1931,0.1414,0.0,0.0007,0.0,0.3629,0.0002,0.0,0.0001],"gain_curve":[-0.0049,-0.0061,0.0,0.0484,0.0,-0.0631,0.0214,-0.0001,0.0,0.0078,-0.0329,-0.1931,-0.1414,0.0,-0.0007,0.0,-0.3629,0.038400000000000004,0.0,-0.0001],"cost_curve":[0.085742,0.097196,0.091608,0.0839814,0.0856196,0.0874018,0.0890164,0.090467,0.0922698,0.0951988,0.0982442,0.1001114,0.102118,0.104387,0.1064298,0.1079644,0.1095472,0.1181416,0.1193224,0.120789]},{"run_name":"icl-gemini-3.1-pro-preview","task":"cohort_studies","run_index":1,"reward":0.409,"baseline_reward":0.8200999999999998,"reference_reward":3.24404,"gain":-0.41109999999999985,"normalized_reward":-0.260275434088747,"normalized_gain":-0.1695999075884716,"cost_usd":1.8173724,"latency_seconds":8.745135,"instance_count":20,"reward_curve":[0.0,0.0,0.3788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0007,0.0,0.0,0.0002,0.0,0.0293],"baseline_reward_curve":[0.0049,0.0061,0.0,0.0,0.0,0.0631,0.0,0.0001,0.0,0.0146,0.0329,0.1931,0.1414,0.0,0.0007,0.0,0.3629,0.0002,0.0,0.0001],"gain_curve":[-0.0049,-0.0061,0.3788,0.0,0.0,-0.0631,0.0,-0.0001,0.0,-0.0146,-0.0329,-0.1931,-0.1414,0.0,0.0,0.0,-0.3629,0.0,0.0,0.0292],"cost_curve":[0.075538,0.090316,0.0777744,0.0725616,0.0904558,0.0778736,0.0873472,0.075803,0.0910414,0.0922296,0.0811742,0.0863358,0.099979,0.0836412,0.1036972,0.089112,0.1152286,0.1156308,0.116409,0.095224]},{"run_name":"icl-gemini-3.1-pro-preview","task":"cohort_studies","run_index":2,"reward":0.0007000000000000001,"baseline_reward":0.8200999999999998,"reference_reward":3.24404,"gain":-0.8193999999999998,"normalized_reward":-0.4417792081936751,"normalized_gain":-0.33804467107271624,"cost_usd":2.0677308,"latency_seconds":7.992501,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0,0.0001,0.0,0.0,0.0002,0.0,0.0,0.0,0.0,0.0,0.0002,0.0001,0.0],"baseline_reward_curve":[0.0049,0.0061,0.0,0.0,0.0,0.0631,0.0,0.0001,0.0,0.0146,0.0329,0.1931,0.1414,0.0,0.0007,0.0,0.3629,0.0002,0.0,0.0001],"gain_curve":[-0.0049,-0.0061,0.0,0.0,0.0,-0.0631,0.0001,-0.0001,0.0001,-0.0146,-0.0329,-0.1929,-0.1414,0.0,-0.0007,0.0,-0.3629,0.0,0.0001,-0.0001],"cost_curve":[0.084836,0.11354,0.0933508,0.0872114,0.0885792,0.089991,0.0913894,0.0927936,0.0945062,0.0962488,0.0974812,0.0989216,0.1077504,0.1090988,0.1099316,0.1114316,0.1196698,0.120672,0.1294904,0.130837]},{"run_name":"icl-gemini-3.1-pro-preview","task":"cohort_studies","run_index":3,"reward":0.7455,"baseline_reward":0.8200999999999998,"reference_reward":3.24404,"gain":-0.07459999999999978,"normalized_reward":-0.11068929647839108,"normalized_gain":-0.03077633934833361,"cost_usd":1.8216556,"latency_seconds":8.523585,"instance_count":20,"reward_curve":[0.0024,0.0095,0.3851,0.0861,0.0,0.142,0.0,0.0651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0078,0.0089,0.0386],"baseline_reward_curve":[0.0049,0.0061,0.0,0.0,0.0,0.0631,0.0,0.0001,0.0,0.0146,0.0329,0.1931,0.1414,0.0,0.0007,0.0,0.3629,0.0002,0.0,0.0001],"gain_curve":[-0.0025,0.0033999999999999994,0.3851,0.0861,0.0,0.07889999999999998,0.0,0.065,0.0,-0.0146,-0.0329,-0.1931,-0.1414,0.0,-0.0007,0.0,-0.3629,0.0076,0.0089,0.0385],"cost_curve":[0.081864,0.111006,0.0885302,0.069976,0.0776342,0.0768022,0.0961754,0.0810076,0.1053054,0.0792966,0.0969386,0.0609736,0.107055,0.1070446,0.0875316,0.0844082,0.1124124,0.0888466,0.1152298,0.0936176]},{"run_name":"icl-gemini-3.1-pro-preview","task":"cohort_studies","run_index":4,"reward":0.0,"baseline_reward":0.8200999999999998,"reference_reward":3.24404,"gain":-0.8200999999999998,"normalized_reward":-0.44209038292273084,"normalized_gain":-0.338333457098773,"cost_usd":1.4674212,"latency_seconds":16.635931,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0049,0.0061,0.0,0.0,0.0,0.0631,0.0,0.0001,0.0,0.0146,0.0329,0.1931,0.1414,0.0,0.0007,0.0,0.3629,0.0002,0.0,0.0001],"gain_curve":[-0.0049,-0.0061,0.0,0.0,0.0,-0.0631,0.0,-0.0001,0.0,-0.0146,-0.0329,-0.1931,-0.1414,0.0,-0.0007,0.0,-0.3629,-0.0002,0.0,-0.0001],"cost_curve":[0.07183,0.0489924,0.0527434,0.0563782,0.0673978,0.0491278,0.052807,0.0765964,0.0749122,0.0568624,0.0604542,0.0788804,0.1192412,0.0569408,0.0826424,0.0863372,0.0973842,0.0864622,0.097511,0.09392]},{"run_name":"icl-gemini-3.1-pro-preview","task":"database_exploration","run_index":0,"reward":10.8,"baseline_reward":4.7333333333333325,"reference_reward":40.0,"gain":6.066666666666668,"normalized_reward":0.15280464216634435,"normalized_gain":0.17202268431001896,"cost_usd":1.5759886,"latency_seconds":6.859973,"instance_count":40,"reward_curve":[0.06666666666666665,0.7333333333333334,0.0,0.0,0.0,0.8,0.7333333333333334,0.0,0.8,0.0,0.8,0.7333333333333334,0.0,0.8,0.0,0.0,0.8,0.0,0.0,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.8666666666666667,0.0],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.6,0.5333333333333333,0.2666666666666667,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.4,0.4,0.33333333333333337,0.33333333333333337],"gain_curve":[-0.4,0.7333333333333334,0.0,0.0,0.0,0.4666666666666667,0.7333333333333334,0.0,0.20000000000000007,-0.5333333333333333,0.5333333333333333,0.7333333333333334,0.0,0.20000000000000007,0.0,0.0,0.8,0.0,0.0,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,-0.4666666666666667,0.5333333333333333,-0.4,0.5333333333333333,-0.33333333333333337],"cost_curve":[0.120374,0.083662,0.040244,0.0395412,0.0166338,0.0440576,0.0420748,0.0247734,0.0413896,0.0259742,0.0305292,0.0589604,0.02192,0.0369802,0.0415228,0.041305,0.0400508,0.0338298,0.052756,0.026488,0.0436884,0.0292944,0.0289344,0.0517256,0.0258258,0.0295014,0.0476566,0.0194964,0.0320694,0.0286752,0.0287108,0.030501,0.0392176,0.0425718,0.0278084,0.071383,0.0219858,0.0304138,0.0427582,0.0407038]},{"run_name":"icl-gemini-3.1-pro-preview","task":"database_exploration","run_index":1,"reward":1.0666666666666669,"baseline_reward":4.7333333333333325,"reference_reward":40.0,"gain":-3.6666666666666656,"normalized_reward":-0.12959381044487422,"normalized_gain":-0.10396975425330811,"cost_usd":0.7680738,"latency_seconds":5.558835,"instance_count":40,"reward_curve":[0.2666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.6,0.5333333333333333,0.2666666666666667,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.4,0.4,0.33333333333333337,0.33333333333333337],"gain_curve":[-0.19999999999999996,0.0,0.0,0.0,0.0,-0.33333333333333337,0.0,0.8,-0.6,-0.5333333333333333,-0.2666666666666667,0.0,0.0,-0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.4666666666666667,-0.4,-0.4,-0.33333333333333337,-0.33333333333333337],"cost_curve":[0.065118,0.045884,0.03415,0.065204,0.049344,0.0052328,0.0058248,0.033449,0.0199956,0.0172412,0.0125988,0.0126468,0.0281452,0.0362394,0.0127466,0.0136264,0.0191826,0.0215964,0.021674,0.0237676,0.012572,0.0151922,0.0082722,0.008884,0.026138,0.0107778,0.0114158,0.0120096,0.0126156,0.0132474,0.0066332,0.007227,0.0078068,0.0083948,0.0090066,0.0096506,0.0102264,0.0108282,0.0114542,0.0120542]},{"run_name":"icl-gemini-3.1-pro-preview","task":"database_exploration","run_index":2,"reward":12.200000000000001,"baseline_reward":4.7333333333333325,"reference_reward":40.0,"gain":7.466666666666669,"normalized_reward":0.19342359767891687,"normalized_gain":0.21172022684310024,"cost_usd":1.4160828,"latency_seconds":5.598328,"instance_count":40,"reward_curve":[0.06666666666666665,0.0,0.0,0.8,0.8666666666666667,0.8,0.0,0.0,0.8,0.0,0.0,0.8666666666666667,0.0,0.0,0.8666666666666667,0.0,0.9333333333333333,0.0,0.0,0.8666666666666667,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.6,0.5333333333333333,0.2666666666666667,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.4,0.4,0.33333333333333337,0.33333333333333337],"gain_curve":[-0.4,0.0,0.0,0.8,0.8666666666666667,0.4666666666666667,0.0,0.0,0.20000000000000007,-0.5333333333333333,-0.2666666666666667,0.8666666666666667,0.0,-0.6,0.8666666666666667,0.0,0.9333333333333333,0.0,0.0,0.8666666666666667,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,-0.4,-0.4,-0.33333333333333337,0.6],"cost_curve":[0.122116,0.069214,0.0653902,0.0280242,0.0298964,0.0326328,0.0124606,0.0389992,0.0433716,0.0188096,0.020539,0.0314964,0.0404192,0.033674,0.0318684,0.0328062,0.0267362,0.0240114,0.018311,0.0317872,0.0390826,0.0319534,0.0539562,0.0288972,0.03311,0.0314718,0.0301842,0.0272826,0.0300688,0.0353368,0.05166,0.0334086,0.0339344,0.0207004,0.0236862,0.0261042,0.0302156,0.04933,0.0293006,0.0238356]},{"run_name":"icl-gemini-3.1-pro-preview","task":"database_exploration","run_index":3,"reward":17.0,"baseline_reward":4.7333333333333325,"reference_reward":40.0,"gain":12.266666666666667,"normalized_reward":0.332688588007737,"normalized_gain":0.3478260869565218,"cost_usd":1.5668352,"latency_seconds":6.694212,"instance_count":40,"reward_curve":[0.33333333333333337,0.6666666666666667,0.6,0.6666666666666667,0.0,0.0,0.8666666666666667,0.0,0.8666666666666667,0.8,0.7333333333333334,0.7333333333333334,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.8666666666666667,0.8666666666666667,0.9333333333333333,0.0,0.0,0.0,0.8,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.6,0.5333333333333333,0.2666666666666667,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.4,0.4,0.33333333333333337,0.33333333333333337],"gain_curve":[-0.1333333333333333,0.6666666666666667,0.6,0.6666666666666667,0.0,-0.33333333333333337,0.8666666666666667,0.0,0.2666666666666667,0.2666666666666667,0.4666666666666667,0.7333333333333334,0.8666666666666667,-0.6,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.8666666666666667,0.8666666666666667,0.9333333333333333,0.0,0.0,0.0,0.8,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,-0.4666666666666667,0.5333333333333333,0.5333333333333333,0.6,0.6],"cost_curve":[0.090512,0.079634,0.0719424,0.0550886,0.024964,0.0406324,0.0306708,0.0182164,0.0264046,0.0389068,0.0477296,0.04383,0.0393514,0.03327,0.0300918,0.0214232,0.0262222,0.0264156,0.0449832,0.0222786,0.0563532,0.054559,0.0372524,0.0469884,0.0313932,0.0329856,0.0364832,0.0590958,0.0496778,0.0271336,0.0652772,0.0282148,0.0240926,0.0260662,0.0295114,0.0314928,0.0348208,0.029978,0.0252032,0.0276884]},{"run_name":"icl-gemini-3.1-pro-preview","task":"database_exploration","run_index":4,"reward":16.733333333333334,"baseline_reward":4.7333333333333325,"reference_reward":40.0,"gain":12.000000000000002,"normalized_reward":0.32495164410058036,"normalized_gain":0.34026465028355396,"cost_usd":1.2744034,"latency_seconds":5.711731,"instance_count":40,"reward_curve":[0.6,0.8666666666666667,0.0,0.9333333333333333,0.9333333333333333,0.7333333333333334,0.0,0.9333333333333333,0.8666666666666667,0.8666666666666667,0.8,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.8666666666666667,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.0],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.6,0.5333333333333333,0.2666666666666667,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.4,0.4,0.33333333333333337,0.33333333333333337],"gain_curve":[0.1333333333333333,0.8666666666666667,0.0,0.9333333333333333,0.9333333333333333,0.4,0.0,0.9333333333333333,0.2666666666666667,0.33333333333333337,0.5333333333333333,0.0,0.0,0.33333333333333337,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.8666666666666667,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.4666666666666667,-0.4,0.5333333333333333,-0.33333333333333337,-0.33333333333333337],"cost_curve":[0.059766,0.04971,0.0439036,0.0124952,0.0152392,0.0428962,0.0328122,0.0133168,0.030607,0.0327958,0.0310514,0.017012,0.0241148,0.0214238,0.0327724,0.0186766,0.0287804,0.0252244,0.0256678,0.028699,0.0281272,0.091782,0.029178,0.0164474,0.0311768,0.0236998,0.0292308,0.0480654,0.0325838,0.0241294,0.0266044,0.0300776,0.0491586,0.0228698,0.042198,0.0299506,0.0322944,0.0204172,0.03648,0.0429676]},{"run_name":"icl-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":0,"reward":74.5,"baseline_reward":43.5,"reference_reward":1138.5,"gain":31.0,"normalized_reward":-0.058811822071847944,"normalized_gain":0.028310502283105023,"cost_usd":3.8163942,"latency_seconds":7.534856,"instance_count":120,"reward_curve":[-1.0,-0.5,-1.0,3.0,-2.0,1.0,3.0,-1.0,27.0,-6.0,3.0,-6.0,6.0,3.0,-3.0,-0.5,6.0,-3.0,6.0,24.0,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,1.0,-3.0,0.5,0.5,-3.0,-0.5,-1.0,10.0,-1.0,-1.0,-0.5,-0.5,-0.5,-3.0,-6.0,-0.5,-0.5,-0.5,3.0,1.0,-3.0,-3.0,1.0,6.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,1.0,1.0,0.5,12.0,-3.0,3.0,-0.5,1.0,2.0,1.0,-0.5,0.5,-0.5,-0.5,-1.0,0.5,0.5,1.0,-0.5,4.0,3.0,0.5,-0.5,-0.5,-1.0,1.0,-1.0,0.5,-11.0,-1.0,1.0,-1.0,-0.5,-3.0,-1.0,-0.5,-1.0,16.0,2.0,0.5,-1.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,6.0,-0.5,1.0,1.0,-1.0,15.0,-2.0,1.0,-0.5,2.0,3.0,-1.0,-0.5,2.0,-2.0,2.5,9.5,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,1.0,-1.0,0.5,0.5,-1.0,-0.5,-1.0,12.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.0,10.0,-0.5,-0.5,-0.5,1.0,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.5,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,2.5,0.5,-0.5,-0.5,-1.0,-1.0,-1.0,0.5,-16.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,18.5,2.0,0.5,-1.0],"gain_curve":[0.0,0.0,0.0,-3.0,-1.5,0.0,2.0,0.0,12.0,-4.0,2.0,-5.5,4.0,0.0,-2.0,0.0,4.0,-1.0,3.5,14.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,-2.0,0.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,-1.0,-16.0,0.0,0.0,0.0,2.0,0.0,3.0,5.0,-1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,2.0,0.0,7.0,-2.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.5,0.0,0.0,0.0,0.0,2.0,0.0,0.0,5.0,0.0,2.0,0.0,0.0,-5.0,2.0,0.0,0.0,-2.5,0.0,0.0,0.0],"cost_curve":[0.02225,0.006488,0.026538,0.037648,0.049122,0.05791,0.068612,0.0224766,0.0366112,0.0464434,0.0357392,0.0334298,0.0464868,0.0282116,0.036919,0.0123822,0.0442388,0.0328742,0.0477252,0.0467634,0.0097904,0.0118648,0.0096912,0.0104274,0.0112856,0.011868,0.0,0.0126584,0.0,0.0135448,0.0448302,0.0,0.0101132,0.0220264,0.0,0.0,0.0754038,0.0170998,0.1125234,0.0553642,0.0399636,0.014645,0.0153272,0.0163194,0.0098796,0.0219034,0.0250556,0.020374,0.0139522,0.0148084,0.0619334,0.0559338,0.0594496,0.0629552,0.052236,0.0704698,0.017427,0.061907,0.0566662,0.0673962,0.0185744,0.0,0.0,0.0196632,0.0129696,0.0137896,0.0217236,0.015112,0.0164922,0.0166664,0.0174026,0.0775174,0.0,0.013781,0.0694884,0.0536414,0.0188926,0.0,0.019777,0.020695,0.0142876,0.0,0.071464,0.0745934,0.0409704,0.0213012,0.0148656,0.0739686,0.0181734,0.0188196,0.0,0.0198858,0.020406,0.0884424,0.0,0.0,0.0533826,0.0187642,0.082622,0.0449162,0.0,0.0236744,0.0242746,0.0751478,0.0204306,0.0212788,0.0,0.1370174,0.0187834,0.0194816,0.0203998,0.020892,0.0440816,0.0232158,0.023678,0.0245742,0.147448,0.1174462,0.0,0.025557]},{"run_name":"icl-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":1,"reward":64.5,"baseline_reward":43.5,"reference_reward":1138.5,"gain":21.0,"normalized_reward":-0.06876306100109464,"normalized_gain":0.019178082191780823,"cost_usd":4.1397982,"latency_seconds":5.693979,"instance_count":120,"reward_curve":[13.5,1.0,1.0,-1.0,4.0,5.0,-1.0,-1.0,5.5,2.5,-1.0,29.0,1.0,-1.0,3.0,-2.5,-4.0,-1.0,-3.0,-0.5,-1.0,-0.5,0.0,-1.0,-0.5,0.5,10.0,-3.0,-1.0,0.5,-1.0,-0.5,0.5,-0.5,-0.5,0.5,-1.0,-1.0,-0.5,0.5,-1.0,-0.5,-5.0,-0.5,-0.5,10.0,-0.5,-1.0,-0.5,1.0,-1.0,6.0,-1.0,-4.0,-4.0,1.0,-0.5,1.0,1.0,-1.0,2.5,-0.5,-1.0,2.5,-1.0,-0.5,0.5,0.5,-1.0,3.0,0.5,-0.5,-0.5,-0.5,-0.5,-0.5,1.0,-1.0,-0.5,-0.5,0.5,0.5,1.0,0.5,-0.5,-3.0,6.0,-1.0,-0.5,1.0,0.5,1.0,1.0,1.0,-0.5,-1.0,0.5,-0.5,-1.0,1.0,-11.0,-0.5,-0.5,0.5,-1.0,-5.0,-1.0,-1.0,0.5,2.0,-1.0,-1.0,16.0,-0.5,4.0,0.5,3.0,3.0,-1.0,1.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,6.0,-0.5,1.0,1.0,-1.0,15.0,-2.0,1.0,-0.5,2.0,3.0,-1.0,-0.5,2.0,-2.0,2.5,9.5,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,1.0,-1.0,0.5,0.5,-1.0,-0.5,-1.0,12.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.0,10.0,-0.5,-0.5,-0.5,1.0,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.5,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,2.5,0.5,-0.5,-0.5,-1.0,-1.0,-1.0,0.5,-16.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,18.5,2.0,0.5,-1.0],"gain_curve":[14.5,1.5,2.0,-7.0,4.5,4.0,-2.0,0.0,-9.5,4.5,-2.0,29.5,-1.0,-4.0,4.0,-2.0,-6.0,1.0,-5.5,-10.0,0.0,0.5,0.5,-0.5,0.0,1.0,9.5,-2.5,-1.5,1.0,-1.0,-1.0,-0.5,0.5,-1.0,0.0,0.0,-0.5,0.5,-11.5,0.0,0.5,-4.5,0.0,0.0,12.0,-10.5,-0.5,0.0,1.5,-2.0,5.0,5.0,4.0,-6.0,0.0,0.0,2.0,2.0,0.0,3.0,-1.0,-1.5,3.0,-0.5,-1.5,1.0,1.0,-2.0,3.5,1.0,-6.5,-1.0,0.0,0.5,-1.5,1.5,-1.5,0.0,-2.5,1.5,0.0,-4.0,1.5,-4.0,-2.5,5.0,-3.0,-1.5,1.5,0.0,2.0,1.5,2.0,-1.0,-1.5,-0.5,0.0,0.0,-1.5,-11.5,0.0,0.0,1.5,0.0,-4.0,-1.5,15.0,1.5,3.0,0.0,-0.5,14.0,2.5,4.5,1.5,-15.5,1.0,-1.5,2.0],"cost_curve":[0.024518,0.030798,0.036154,0.045186,0.059566,0.068742,0.0355056,0.0307596,0.0414876,0.0349424,0.0388654,0.0475708,0.0276304,0.0395158,0.0495474,0.0325806,0.0442404,0.0469552,0.0357888,0.011019,0.0116556,0.0153798,0.0555696,0.0391024,0.0111364,0.0,0.06841,0.0252134,0.0099212,0.0,0.0110138,0.0114982,0.0,0.0123886,0.0129848,0.0,0.0434906,0.0257576,0.0100252,0.0,0.0225872,0.0118836,0.0412582,0.0147694,0.0154396,0.0673852,0.0125918,0.0131842,0.0137502,0.0147024,0.0667152,0.0489382,0.0590302,0.0702874,0.0513044,0.0621038,0.0179706,0.0682428,0.0575028,0.0667816,0.08178,0.0141648,0.0623302,0.0714438,0.073585,0.0145852,0.0,0.0,0.067963,0.0371934,0.0,0.019586,0.02031,0.129044,0.0145926,0.0152606,0.0160628,0.070137,0.019074,0.0199042,0.0,0.0,0.0209988,0.0,0.0216552,0.0713044,0.0748182,0.0830678,0.0224316,0.0232058,0.0,0.0168164,0.0176306,0.0185208,0.019019,0.0199352,0.0,0.0206316,0.0214556,0.0223118,0.11875,0.0193434,0.0202236,0.0,0.021052,0.1154198,0.0429828,0.0780214,0.0,0.1124586,0.024312,0.0251522,0.101188,0.02169,0.0941146,0.0,0.0513558,0.0873482,0.0217416,0.0224758]},{"run_name":"icl-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":2,"reward":88.5,"baseline_reward":43.5,"reference_reward":1138.5,"gain":45.0,"normalized_reward":-0.044880087570902574,"normalized_gain":0.0410958904109589,"cost_usd":3.7499724,"latency_seconds":6.274566,"instance_count":120,"reward_curve":[24.5,-1.0,-0.5,1.0,-1.0,3.0,-1.0,-3.0,-2.0,-0.5,27.0,-1.0,-0.5,6.0,3.0,1.0,3.0,-3.0,4.0,1.0,-0.5,-0.5,1.0,-3.0,-0.5,0.5,1.0,-2.0,0.5,0.5,-1.0,-0.5,-0.5,-1.0,-1.0,0.5,-0.5,-0.5,-1.0,-0.5,-0.5,10.0,-0.5,10.0,-0.5,0.5,-1.0,-1.0,-0.5,-0.5,1.0,1.0,-4.0,-1.0,-1.0,-0.5,6.0,-1.0,1.0,-4.0,0.5,2.0,0.5,-0.5,0.5,1.0,-0.5,-0.5,-0.5,-0.5,1.0,6.0,-0.5,-0.5,-0.5,3.0,-2.5,1.0,-1.0,-0.5,1.0,1.0,0.5,2.0,10.0,0.5,0.5,-0.5,-1.0,-1.0,-0.5,-0.5,-0.5,0.5,-1.0,-0.5,1.0,-1.0,2.0,-1.0,0.5,-1.0,-11.0,-1.0,-1.0,-0.5,-1.0,0.5,-0.5,17.0,-1.0,-1.0,-0.5,-1.0,-0.5,0.5,-1.0,0.5,3.0,4.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,6.0,-0.5,1.0,1.0,-1.0,15.0,-2.0,1.0,-0.5,2.0,3.0,-1.0,-0.5,2.0,-2.0,2.5,9.5,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,1.0,-1.0,0.5,0.5,-1.0,-0.5,-1.0,12.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.0,10.0,-0.5,-0.5,-0.5,1.0,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.5,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,2.5,0.5,-0.5,-0.5,-1.0,-1.0,-1.0,0.5,-16.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,18.5,2.0,0.5,-1.0],"gain_curve":[25.5,-0.5,0.5,-5.0,-0.5,2.0,-2.0,-2.0,-17.0,1.5,26.0,-0.5,-2.5,3.0,4.0,1.5,1.0,-1.0,1.5,-8.5,0.5,0.5,1.5,-2.5,0.0,1.0,0.5,-1.5,0.0,1.0,-1.0,-1.0,-1.5,0.0,-1.5,0.0,0.5,0.0,0.0,-12.5,0.5,11.0,0.0,10.5,0.0,2.5,-11.0,-0.5,0.0,0.0,0.0,0.0,2.0,7.0,-3.0,-1.5,6.5,0.0,2.0,-3.0,1.0,1.5,0.0,0.0,1.0,0.0,0.0,0.0,-1.5,0.0,1.5,0.0,-1.0,0.0,0.5,2.0,-2.0,0.5,-0.5,-2.5,2.0,0.5,-4.5,3.0,6.5,1.0,-0.5,-2.5,-2.0,-0.5,-1.0,0.5,0.0,1.5,-1.5,-1.0,0.0,-0.5,3.0,-3.5,0.0,-0.5,-10.5,0.0,0.0,0.5,-1.5,16.5,0.5,18.0,0.0,-0.5,-2.5,2.0,0.0,1.5,-19.5,-1.5,2.5,5.0],"cost_curve":[0.024628,0.031912,0.010558,0.042086,0.04769,0.059468,0.066238,0.022455,0.031429,0.009937,0.044303,0.0305632,0.0105148,0.0410276,0.0307332,0.0332796,0.0494966,0.0326674,0.0381784,0.0480996,0.0131352,0.0065608,0.007537,0.0213502,0.0130234,0.0,0.0355428,0.0434302,0.0,0.0,0.014548,0.008252,0.0087802,0.0095624,0.0324812,0.0,0.0121784,0.0129546,0.0285994,0.0150348,0.008313,0.054247,0.012457,0.0747058,0.0092126,0.0,0.0103312,0.0110194,0.0114076,0.0125498,0.0568046,0.0582204,0.047196,0.0563984,0.0656142,0.0110432,0.0517446,0.0607626,0.071455,0.0531296,0.0,0.0622816,0.0,0.0172766,0.0,0.0630862,0.0132344,0.0139166,0.0146166,0.0153148,0.0163008,0.0716422,0.0192668,0.0129726,0.0135128,0.0301874,0.067656,0.0185748,0.0788578,0.01399,0.01507,0.0325064,0.0,0.0727992,0.0853088,0.0,0.0,0.0158574,0.069637,0.0794034,0.021169,0.0218652,0.015357,0.0,0.0684272,0.0190608,0.06072,0.0213428,0.0971508,0.0184128,0.0,0.0194714,0.1313068,0.0409882,0.018017,0.0185634,0.0192596,0.0,0.020124,0.1619084,0.0368824,0.0801484,0.0214884,0.0455308,0.0236384,0.0,0.024611,0.0,0.0447064,0.061705]},{"run_name":"icl-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":3,"reward":75.5,"baseline_reward":43.5,"reference_reward":1138.5,"gain":32.0,"normalized_reward":-0.05781669817892327,"normalized_gain":0.029223744292237442,"cost_usd":4.048749,"latency_seconds":6.646206,"instance_count":120,"reward_curve":[2.0,3.0,2.5,10.0,-5.0,-3.0,3.0,-3.0,-5.0,2.5,-3.0,-0.5,-1.0,1.0,-1.0,-1.0,22.0,-1.0,3.0,20.0,-0.5,-0.5,-1.0,-0.5,0.0,-1.0,-1.0,-0.5,-0.5,-2.5,1.0,0.5,0.5,-1.0,0.5,-0.5,-0.5,-1.0,-0.5,-6.0,10.0,-0.5,-2.5,-0.5,0.5,-0.5,-0.5,-2.5,0.5,-0.5,-3.0,-1.0,1.0,5.0,1.0,1.0,-3.0,-1.0,2.5,-1.0,1.0,2.5,0.5,12.0,-2.5,-0.5,-0.5,-0.5,-0.5,0.5,1.0,1.0,0.5,-0.5,1.0,-0.5,-0.5,-0.5,0.5,-0.5,10.0,0.5,0.5,-1.0,-1.0,-0.5,1.0,1.0,0.5,-0.5,-0.5,-0.5,2.5,-2.0,3.0,-11.0,21.0,-1.0,0.5,0.5,-1.0,1.0,3.5,0.5,2.5,2.0,2.5,1.0,-3.0,-0.5,0.5,-1.0,-0.5,-0.5,-1.0,-1.0,-1.0,-0.5,-1.0,1.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,6.0,-0.5,1.0,1.0,-1.0,15.0,-2.0,1.0,-0.5,2.0,3.0,-1.0,-0.5,2.0,-2.0,2.5,9.5,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,1.0,-1.0,0.5,0.5,-1.0,-0.5,-1.0,12.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.0,10.0,-0.5,-0.5,-0.5,1.0,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.5,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,2.5,0.5,-0.5,-0.5,-1.0,-1.0,-1.0,0.5,-16.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,18.5,2.0,0.5,-1.0],"gain_curve":[3.0,3.5,3.5,4.0,-4.5,-4.0,2.0,-2.0,-20.0,4.5,-4.0,0.0,-3.0,-2.0,0.0,-0.5,20.0,1.0,0.5,10.5,0.5,0.5,-0.5,0.0,0.5,-0.5,-1.5,0.0,-1.0,-2.0,1.0,0.0,-0.5,0.0,0.0,-1.0,0.5,-0.5,0.5,-18.0,11.0,0.5,-2.0,0.0,1.0,1.5,-10.5,-2.0,1.0,0.0,-4.0,-2.0,7.0,13.0,-1.0,0.0,-2.5,0.0,3.5,0.0,1.5,2.0,0.0,12.5,-2.0,-1.5,0.0,0.0,-1.5,1.0,1.5,-5.0,0.0,0.0,2.0,-1.5,0.0,-1.0,1.0,-2.5,11.0,0.0,-4.5,0.0,-4.5,0.0,0.0,-1.0,-0.5,0.0,-1.0,0.5,3.0,-1.0,2.5,-11.5,20.0,-0.5,1.5,-2.0,-1.5,1.5,4.0,1.5,3.5,3.0,2.0,17.0,-2.0,0.5,1.5,-0.5,-2.5,2.5,-0.5,0.0,-19.5,-2.5,-1.5,2.0],"cost_curve":[0.019834,0.031816,0.040358,0.045586,0.056854,0.060474,0.03529,0.0293402,0.0468016,0.0232248,0.0358742,0.0119578,0.0405008,0.0290296,0.039422,0.0419106,0.032031,0.0784994,0.0459842,0.0789368,0.0136076,0.0112258,0.0377296,0.0137858,0.0411656,0.0112658,0.0139262,0.0118184,0.0123406,0.0275494,0.014683,0.0,0.0,0.045432,0.0,0.0109158,0.012014,0.0179884,0.0135566,0.0302218,0.0580008,0.0121414,0.0272554,0.014284,0.0,0.0150464,0.0159086,0.0277638,0.0,0.0110896,0.0516186,0.0619176,0.057561,0.0545368,0.0649388,0.0602226,0.056504,0.0661024,0.0703174,0.0580116,0.0337486,0.036317,0.0,0.0681792,0.0630668,0.0173394,0.11889,0.0186536,0.0194258,0.0,0.0206462,0.0140826,0.0,0.0147512,0.0156232,0.0163674,0.0169616,0.0178558,0.0,0.0188502,0.083284,0.0,0.0,0.065452,0.0745164,0.0202884,0.0212746,0.0219108,0.0,0.0226252,0.0162178,0.016998,0.0753866,0.0857194,0.0401954,0.1142052,0.154908,0.0185692,0.0,0.0,0.0193362,0.0410984,0.0674178,0.0,0.0731042,0.0577052,0.0415404,0.0220964,0.1196192,0.0255984,0.0,0.0194312,0.0201254,0.0207256,0.0446372,0.0952568,0.0256272,0.0263474,0.019846,0.0207542]},{"run_name":"icl-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":4,"reward":79.0,"baseline_reward":43.5,"reference_reward":1138.5,"gain":35.5,"normalized_reward":-0.05433376455368693,"normalized_gain":0.032420091324200914,"cost_usd":4.1924002,"latency_seconds":6.324937,"instance_count":120,"reward_curve":[1.0,-0.5,3.0,22.0,-1.0,5.5,3.0,3.0,1.0,-1.0,-2.5,-1.0,2.5,24.0,-3.0,-1.0,-1.0,-0.5,3.0,-1.0,-0.5,0.5,2.0,-1.0,-0.5,0.0,-1.0,-1.0,-1.0,10.0,-3.0,-0.5,0.5,-0.5,-3.0,-3.0,-0.5,0.0,-1.0,-0.5,0.5,-0.5,-0.5,0.5,0.5,-0.5,1.0,-0.5,-2.0,-0.5,-1.0,-3.0,-3.0,-1.0,2.0,1.0,-1.0,6.0,1.0,-0.5,1.0,2.0,1.0,0.5,-0.5,-1.0,-0.5,-1.0,1.0,0.5,-2.0,-1.0,-0.5,-0.5,9.0,-0.5,-0.5,0.5,0.5,-0.5,0.5,-0.5,1.0,1.0,-1.0,0.5,4.0,-0.5,12.0,1.0,0.5,-3.0,-0.5,-0.5,-0.5,-0.5,-1.0,-0.5,-1.0,-0.5,0.5,4.0,2.0,-1.0,-3.0,-1.0,21.0,0.5,0.5,-0.5,-1.0,-1.0,3.0,3.0,0.5,-1.0,1.0,-15.0,-1.0,-0.5],"baseline_reward_curve":[-1.0,-0.5,-1.0,6.0,-0.5,1.0,1.0,-1.0,15.0,-2.0,1.0,-0.5,2.0,3.0,-1.0,-0.5,2.0,-2.0,2.5,9.5,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,1.0,-1.0,0.5,0.5,-1.0,-0.5,-1.0,12.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.0,10.0,-0.5,-0.5,-0.5,1.0,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.5,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,2.5,0.5,-0.5,-0.5,-1.0,-1.0,-1.0,0.5,-16.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,18.5,2.0,0.5,-1.0],"gain_curve":[2.0,0.0,4.0,16.0,-0.5,4.5,2.0,4.0,-14.0,1.0,-3.5,-0.5,0.5,21.0,-2.0,-0.5,-3.0,1.5,0.5,-10.5,0.5,1.5,2.5,-0.5,0.0,0.5,-1.5,-0.5,-1.5,10.5,-3.0,-1.0,-0.5,0.5,-3.5,-3.5,0.5,0.5,0.0,-12.5,1.5,0.5,0.0,1.0,1.0,1.5,-9.0,0.0,-1.5,0.0,-2.0,-4.0,3.0,7.0,0.0,0.0,-0.5,7.0,2.0,0.5,1.5,1.5,0.5,1.0,0.0,-2.0,0.0,-0.5,0.0,1.0,-1.5,-7.0,-1.0,0.0,10.0,-1.5,0.0,0.0,1.0,-2.5,1.5,-1.0,-4.0,2.0,-4.5,1.0,3.0,-2.5,11.0,1.5,0.0,-2.0,0.0,0.5,-1.0,-1.0,-2.0,0.0,0.0,-3.0,0.0,4.5,2.5,0.0,-2.0,0.0,20.5,16.5,1.5,0.5,0.0,-0.5,1.0,6.0,1.0,0.0,-17.5,-17.0,-1.5,0.5],"cost_curve":[0.019714,0.0064,0.035484,0.045122,0.05213,0.061116,0.069396,0.040418,0.046651,0.0455726,0.038222,0.037698,0.0489754,0.0398208,0.043323,0.058139,0.0421038,0.01154,0.0436462,0.0536336,0.0177566,0.0,0.0505086,0.0492194,0.0180184,0.0473916,0.00991,0.0107422,0.0114986,0.0766344,0.0268572,0.016721,0.0,0.010173,0.0229798,0.0263218,0.0136814,0.067037,0.0311944,0.0115186,0.0,0.013539,0.0131312,0.0,0.0,0.0141998,0.014836,0.0155522,0.0578888,0.0120208,0.0583908,0.0659298,0.0608942,0.056508,0.0668328,0.0618372,0.0567128,0.0680054,0.062762,0.0139674,0.02176,0.0641876,0.0423626,0.0,0.0189758,0.068709,0.0149954,0.0668018,0.018317,0.0,0.0804942,0.0684202,0.0168926,0.0174068,0.0776536,0.0207548,0.021607,0.0,0.0,0.0152094,0.0,0.023256,0.0169818,0.01773,0.0766384,0.0,0.0426216,0.0221378,0.0765062,0.0187364,0.0,0.0827378,0.0220168,0.022823,0.023479,0.0177412,0.0370968,0.0192456,0.0406472,0.0212296,0.0,0.0700134,0.0672806,0.0186314,0.1039112,0.0225832,0.1565142,0.0,0.0,0.0206074,0.0217696,0.0922908,0.0505902,0.0461402,0.0,0.0206356,0.0212518,0.1418446,0.0256356,0.0262498]},{"run_name":"icl-gemini-3.1-pro-preview","task":"sales_prediction","run_index":0,"reward":6.588099999999999,"baseline_reward":3.9083999999999994,"reference_reward":12.0,"gain":2.6796999999999995,"normalized_reward":0.15474721601824218,"normalized_gain":0.33117059666814963,"cost_usd":0.8587716,"latency_seconds":9.40125,"instance_count":12,"reward_curve":[0.5559,0.5581,0.572,0.5621,0.5576,0.5475,0.5416,0.5484,0.5394,0.5332,0.5355,0.5368],"baseline_reward_curve":[0.437,0.2952,0.3043,0.4659,0.5507,0.0,0.5825,0.4115,0.2716,0.4693,0.0358,0.0846],"gain_curve":[0.11889999999999995,0.2629,0.26769999999999994,0.09620000000000006,0.006900000000000017,0.5475,-0.04090000000000005,0.13690000000000002,0.2678,0.06390000000000001,0.4997,0.45220000000000005],"cost_curve":[0.096208,0.060567,0.0590554,0.063738,0.0615832,0.0668566,0.0723944,0.0706046,0.0761026,0.074223,0.079613,0.0778258]},{"run_name":"icl-gemini-3.1-pro-preview","task":"sales_prediction","run_index":1,"reward":6.270099999999999,"baseline_reward":3.9083999999999994,"reference_reward":12.0,"gain":2.3617,"normalized_reward":0.10508066909272644,"normalized_gain":0.2918705818379554,"cost_usd":0.850639,"latency_seconds":10.494393,"instance_count":12,"reward_curve":[0.2379,0.5581,0.572,0.5621,0.5576,0.5475,0.5416,0.5484,0.5394,0.5332,0.5355,0.5368],"baseline_reward_curve":[0.437,0.2952,0.3043,0.4659,0.5507,0.0,0.5825,0.4115,0.2716,0.4693,0.0358,0.0846],"gain_curve":[-0.1991,0.2629,0.26769999999999994,0.09620000000000006,0.006900000000000017,0.5475,-0.04090000000000005,0.13690000000000002,0.2678,0.06390000000000001,0.4997,0.45220000000000005],"cost_curve":[0.071002,0.066707,0.0625838,0.0581586,0.0635142,0.0883636,0.067076,0.072576,0.0706666,0.0761066,0.0742154,0.0796692]},{"run_name":"icl-gemini-3.1-pro-preview","task":"sales_prediction","run_index":2,"reward":6.469199999999999,"baseline_reward":3.9083999999999994,"reference_reward":12.0,"gain":2.5607999999999995,"normalized_reward":0.1361769253596138,"normalized_gain":0.31647634584013046,"cost_usd":0.8249666,"latency_seconds":9.695009,"instance_count":12,"reward_curve":[0.437,0.5581,0.572,0.5621,0.5576,0.5475,0.5416,0.5484,0.5394,0.5332,0.5355,0.5368],"baseline_reward_curve":[0.437,0.2952,0.3043,0.4659,0.5507,0.0,0.5825,0.4115,0.2716,0.4693,0.0358,0.0846],"gain_curve":[0.0,0.2629,0.26769999999999994,0.09620000000000006,0.006900000000000017,0.5475,-0.04090000000000005,0.13690000000000002,0.2678,0.06390000000000001,0.4997,0.45220000000000005],"cost_curve":[0.079226,0.0591864,0.0555682,0.0605412,0.058583,0.0639404,0.0695084,0.0676728,0.0731668,0.0785906,0.0767534,0.0822294]},{"run_name":"icl-gemini-3.1-pro-preview","task":"sales_prediction","run_index":3,"reward":6.469199999999999,"baseline_reward":3.9083999999999994,"reference_reward":12.0,"gain":2.5607999999999995,"normalized_reward":0.1361769253596138,"normalized_gain":0.31647634584013046,"cost_usd":0.8583486,"latency_seconds":9.12676,"instance_count":12,"reward_curve":[0.437,0.5581,0.572,0.5621,0.5576,0.5475,0.5416,0.5484,0.5394,0.5332,0.5355,0.5368],"baseline_reward_curve":[0.437,0.2952,0.3043,0.4659,0.5507,0.0,0.5825,0.4115,0.2716,0.4693,0.0358,0.0846],"gain_curve":[0.0,0.2629,0.26769999999999994,0.09620000000000006,0.006900000000000017,0.5475,-0.04090000000000005,0.13690000000000002,0.2678,0.06390000000000001,0.4997,0.45220000000000005],"cost_curve":[0.095472,0.0639348,0.0566728,0.0618714,0.0597284,0.0650782,0.0706678,0.0689462,0.074424,0.0799,0.0780466,0.0836064]},{"run_name":"icl-gemini-3.1-pro-preview","task":"sales_prediction","run_index":4,"reward":6.5434,"baseline_reward":3.9083999999999994,"reference_reward":12.0,"gain":2.6350000000000007,"normalized_reward":0.147765786308901,"normalized_gain":0.3256463493005093,"cost_usd":0.8564852,"latency_seconds":8.963074,"instance_count":12,"reward_curve":[0.5112,0.5581,0.572,0.5621,0.5576,0.5475,0.5416,0.5484,0.5394,0.5332,0.5355,0.5368],"baseline_reward_curve":[0.437,0.2952,0.3043,0.4659,0.5507,0.0,0.5825,0.4115,0.2716,0.4693,0.0358,0.0846],"gain_curve":[0.07419999999999999,0.2629,0.26769999999999994,0.09620000000000006,0.006900000000000017,0.5475,-0.04090000000000005,0.13690000000000002,0.2678,0.06390000000000001,0.4997,0.45220000000000005],"cost_curve":[0.093284,0.0572808,0.057523,0.0625478,0.0609428,0.0661604,0.0714382,0.0695444,0.0749722,0.08039,0.0784988,0.0839028]},{"run_name":"icl-gpt-5.4","task":"blind_spectrum_monitoring","run_index":0,"reward":49.69540000000001,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":29.934400000000004,"normalized_reward":0.4261791882002876,"normalized_gain":0.4261791882002876,"cost_usd":2.0247145,"latency_seconds":6.379304,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3509,0.3548,0.3646,0.3573,0.3434,0.3608,0.3596,0.3924,0.399,0.4003,0.5184,0.4563,0.4432,0.4731,0.48,0.4695,0.5051,0.4882,0.4821,0.5056,0.4966,0.534,0.5207,0.5224,0.4924,0.4815,0.5018,0.5395,0.5359,0.5409,0.5243,0.5172,0.5308,0.5651,0.5722,0.5321,0.5382,0.5928,0.5369,0.5593,0.5688,0.5586,0.5932,0.6333,0.6803,0.6531,0.6367,0.6689,0.6331,0.636,0.664,0.6365,0.6834,0.6291,0.6273,0.643,0.5746,0.6299,0.6244,0.6212,0.5916,0.6058,0.6272,0.6093,0.6616,0.6544,0.6277,0.6653,0.6848,0.6816,0.6521,0.6241,0.6128,0.6602,0.6651,0.7107,0.7421,0.7206,0.6574,0.6141,0.6052,0.56,0.5359,0.5455,0.5666,0.616,0.6075],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.1245,0.1307,0.1518,0.13,0.14839999999999998,0.13870000000000002,0.14699999999999996,0.15200000000000002,0.1705,0.181,0.2701,0.2643,0.2458,0.24920000000000003,0.253,0.263,0.2577,0.2864,0.2802,0.2926000000000001,0.2883,0.30960000000000004,0.28740000000000004,0.31299999999999994,0.28190000000000004,0.25029999999999997,0.29460000000000003,0.3413,0.3274,0.33140000000000003,0.3216,0.29369999999999996,0.31690000000000007,0.3622000000000001,0.33080000000000004,0.3348,0.3179,0.3664,0.34430000000000005,0.3196,0.34719999999999995,0.3313,0.36579999999999996,0.41179999999999994,0.4494,0.41890000000000005,0.40800000000000003,0.45120000000000005,0.41159999999999997,0.4285,0.45130000000000003,0.41189999999999993,0.4582,0.4293,0.3912,0.4475,0.359,0.388,0.4129999999999999,0.40459999999999996,0.37060000000000004,0.4077,0.41169999999999995,0.38209999999999994,0.4064,0.4456,0.40650000000000003,0.4112,0.4709,0.4344,0.4218,0.4033,0.3751,0.41800000000000004,0.45220000000000005,0.4619,0.5424,0.5127,0.43979999999999997,0.39749999999999996,0.39509999999999995,0.34070000000000006,0.3355,0.3459,0.3649,0.3718,0.38530000000000003],"cost_curve":[0.003115,0.0063725,0.0055665,0.0074905,0.007559,0.0074825,0.008534,0.00853,0.0090865,0.0097475,0.010496,0.0107595,0.011028,0.011994,0.012297,0.0127675,0.012655,0.0128405,0.0131665,0.013362,0.0135655,0.014039,0.013952,0.014423,0.0150465,0.0152245,0.0154175,0.015598,0.016224,0.016399,0.016862,0.0166245,0.017093,0.0172835,0.017752,0.01795,0.0181355,0.0186115,0.018807,0.01913,0.019458,0.0195035,0.020122,0.02044,0.021048,0.021103,0.021298,0.0217435,0.021916,0.0219615,0.022147,0.0226055,0.0226485,0.0231295,0.0235925,0.0242805,0.0240625,0.024253,0.024714,0.0250395,0.0256525,0.0256675,0.026018,0.0263335,0.0262285,0.026822,0.1913375,0.030095,0.0275505,0.027876,0.0282065,0.0289495,0.0292445,0.0294445,0.0294795,0.0299375,0.029975,0.0301705,0.030804,0.030849,0.031177,0.031515,0.031838,0.031886,0.032917,0.0326565,0.033262,0.0331645,0.033495,0.034108]},{"run_name":"icl-gpt-5.4","task":"blind_spectrum_monitoring","run_index":1,"reward":47.163500000000006,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":27.402500000000003,"normalized_reward":0.39013226270305673,"normalized_gain":0.39013226270305673,"cost_usd":1.996725,"latency_seconds":6.06443,"instance_count":90,"reward_curve":[0.2072,0.2357,0.2546,0.275,0.2764,0.2634,0.2855,0.3435,0.3331,0.357,0.4014,0.4229,0.4101,0.4261,0.4442,0.4429,0.4268,0.4281,0.4107,0.4527,0.4487,0.507,0.4588,0.496,0.4304,0.4577,0.4838,0.5046,0.5006,0.5186,0.5368,0.5361,0.5185,0.5379,0.5347,0.598,0.592,0.594,0.6091,0.6132,0.638,0.6991,0.6688,0.6275,0.6362,0.6405,0.5984,0.5502,0.5339,0.5497,0.4767,0.4608,0.5125,0.5048,0.5607,0.5816,0.6318,0.5935,0.6049,0.6456,0.6069,0.6295,0.62,0.6455,0.6405,0.6164,0.591,0.578,0.5508,0.5039,0.4755,0.4984,0.5405,0.5878,0.6121,0.5865,0.6062,0.6457,0.6262,0.6238,0.6005,0.5814,0.5445,0.5349,0.6337,0.6117,0.6694,0.6627,0.6335,0.617],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.012500000000000011,0.042899999999999994,0.04860000000000003,0.052299999999999985,0.050600000000000034,0.058199999999999974,0.14850000000000002,0.11100000000000002,0.14439999999999997,0.16099999999999998,0.1944,0.19080000000000003,0.17779999999999999,0.2522,0.24550000000000002,0.20290000000000002,0.20109999999999997,0.20420000000000002,0.20529999999999998,0.24689999999999998,0.30510000000000004,0.2458,0.28769999999999996,0.20600000000000002,0.2244,0.2744,0.29410000000000003,0.2694000000000001,0.31139999999999995,0.33860000000000007,0.3276,0.30899999999999994,0.33520000000000005,0.3111999999999999,0.3841,0.3891,0.35259999999999997,0.41179999999999994,0.39289999999999997,0.4116,0.5065000000000001,0.4290999999999999,0.4058999999999999,0.4089,0.41309999999999997,0.3769,0.31930000000000003,0.2997000000000001,0.32099999999999995,0.259,0.23929999999999998,0.30499999999999994,0.2921,0.33609999999999995,0.3564,0.43200000000000005,0.35740000000000005,0.4094,0.42999999999999994,0.365,0.4180999999999999,0.4034,0.4245,0.44239999999999996,0.4008999999999999,0.36379999999999996,0.3228,0.34199999999999997,0.2827,0.22139999999999999,0.2845,0.2933,0.3575,0.3913,0.3488,0.364,0.4328000000000001,0.37739999999999996,0.42410000000000003,0.39260000000000006,0.3638,0.32789999999999997,0.32480000000000003,0.41440000000000005,0.4113,0.4698,0.46099999999999997,0.3893,0.3948],"cost_curve":[0.00475,0.007795,0.006533,0.007357,0.007008,0.007157,0.0074435,0.0082725,0.0084835,0.009385,0.009811,0.0101965,0.010265,0.0106115,0.0115075,0.0446125,0.0147685,0.012152,0.012843,0.012971,0.0130965,0.0143275,0.014225,0.014268,0.015264,0.015382,0.016073,0.0157655,0.0158865,0.0160795,0.0166955,0.0168835,0.017074,0.017117,0.0174505,0.017924,0.0181095,0.0183,0.019196,0.019656,0.0195255,0.0198435,0.086254,0.023069,0.0208,0.0205625,0.021031,0.021224,0.0218375,0.022298,0.0222055,0.0223785,0.022712,0.0233305,0.0235055,0.023686,0.024029,0.024337,0.02482,0.066202,0.027655,0.0253885,0.0260015,0.025904,0.0265475,0.026575,0.027158,0.027358,0.027816,0.028271,0.0283085,0.028504,0.0288495,0.0295925,0.029347,0.0296825,0.0299855,0.0305985,0.030661,0.030844,0.0311495,0.0314725,0.0317955,0.0321335,0.032459,0.0330645,0.033107,0.032867,0.033483,0.0337985]},{"run_name":"icl-gpt-5.4","task":"blind_spectrum_monitoring","run_index":2,"reward":44.404099999999985,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":24.643099999999983,"normalized_reward":0.3508463958769342,"normalized_gain":0.3508463958769342,"cost_usd":1.7839575,"latency_seconds":5.841244,"instance_count":90,"reward_curve":[0.2482,0.2811,0.3026,0.2903,0.3336,0.3201,0.384,0.3467,0.3478,0.356,0.3931,0.4109,0.4459,0.4548,0.449,0.4518,0.4519,0.4217,0.399,0.3659,0.3622,0.3696,0.3623,0.3639,0.3679,0.3859,0.3876,0.3772,0.3939,0.4678,0.487,0.4859,0.4995,0.4737,0.4551,0.4578,0.498,0.4848,0.4665,0.3641,0.4675,0.4701,0.5402,0.5896,0.5969,0.6166,0.6132,0.6272,0.4713,0.4678,0.5034,0.5559,0.5861,0.485,0.5052,0.5005,0.52,0.5174,0.5538,0.52,0.4893,0.5269,0.5568,0.5594,0.5364,0.477,0.5295,0.551,0.6308,0.6683,0.5904,0.5741,0.5562,0.5421,0.5456,0.5434,0.5414,0.5787,0.6242,0.5955,0.5703,0.6144,0.6851,0.7139,0.6943,0.6237,0.6319,0.6641,0.6508,0.6658],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.03290000000000001,0.09089999999999998,0.06390000000000001,0.10950000000000001,0.1073,0.1567,0.1517,0.1257,0.14339999999999997,0.1527,0.18239999999999998,0.22660000000000002,0.2065,0.257,0.25439999999999996,0.22800000000000004,0.1947,0.19250000000000003,0.1185,0.16040000000000001,0.1677,0.14930000000000002,0.1556,0.14350000000000002,0.1526,0.1782,0.1667,0.16269999999999998,0.2606,0.2888,0.2774,0.29000000000000004,0.271,0.2316,0.24389999999999998,0.29510000000000003,0.2434,0.2692,0.14379999999999998,0.24110000000000004,0.2775,0.3005,0.368,0.3696,0.38920000000000005,0.39169999999999994,0.3963,0.2371,0.2391,0.28569999999999995,0.3343999999999999,0.37859999999999994,0.2723,0.28059999999999996,0.27529999999999993,0.32020000000000004,0.2813,0.35829999999999995,0.3044,0.2474,0.3155,0.34019999999999995,0.33840000000000003,0.3383,0.26149999999999995,0.30229999999999996,0.29580000000000006,0.42200000000000004,0.4471,0.33630000000000004,0.3602000000000001,0.30900000000000005,0.3118,0.3248,0.30569999999999997,0.2992,0.3658,0.37539999999999996,0.39580000000000004,0.36240000000000006,0.39679999999999993,0.4685,0.5038,0.47500000000000003,0.4233,0.4323,0.46240000000000003,0.4066000000000001,0.44359999999999994],"cost_curve":[0.00367,0.0061875,0.005864,0.0067925,0.008304,0.008282,0.0089655,0.009994,0.010112,0.010673,0.010796,0.009564,0.009625,0.039805,0.0134815,0.012071,0.0124465,0.01022,0.010281,0.01096,0.0248525,0.0129285,0.01302,0.012909,0.010902,0.013114,0.012865,0.0126965,0.01316,0.013766,0.014457,0.014876,0.0156745,0.0157425,0.0157105,0.0163865,0.0167375,0.0177585,0.017866,0.017839,0.0182895,0.0182955,0.0188835,0.018989,0.0193975,0.019961,0.0197565,0.0204575,0.0190405,0.019259,0.020005,0.020681,0.021432,0.020562,0.0208175,0.0220465,0.022372,0.0224175,0.0230235,0.023104,0.0233245,0.024033,0.0246985,0.024831,0.0248115,0.02537,0.026058,0.026381,0.0267115,0.026917,0.0277,0.0274395,0.0277925,0.028073,0.028616,0.0285835,0.028834,0.029312,0.029775,0.0298375,0.030103,0.0303085,0.0304415,0.0309,0.0312505,0.0315835,0.0319865,0.0323545,0.032657,0.033127]},{"run_name":"icl-gpt-5.4","task":"blind_spectrum_monitoring","run_index":3,"reward":45.089400000000005,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":25.328400000000002,"normalized_reward":0.3606030837568872,"normalized_gain":0.3606030837568872,"cost_usd":1.988515,"latency_seconds":6.312154,"instance_count":90,"reward_curve":[0.192,0.2393,0.2285,0.2318,0.2505,0.2604,0.2812,0.3257,0.3024,0.3461,0.3477,0.3564,0.3496,0.3698,0.3517,0.355,0.3575,0.3666,0.3519,0.4296,0.3866,0.3937,0.3717,0.4145,0.4059,0.4023,0.3584,0.3288,0.4717,0.4918,0.4969,0.4918,0.515,0.5291,0.5302,0.4878,0.4978,0.5404,0.5772,0.5943,0.5697,0.6802,0.6991,0.7102,0.6968,0.6402,0.6205,0.5647,0.5405,0.4871,0.4632,0.5024,0.5756,0.5874,0.5976,0.5827,0.5699,0.5526,0.5954,0.5688,0.5728,0.5701,0.6179,0.5746,0.5698,0.5451,0.5588,0.6503,0.6614,0.6295,0.6874,0.6722,0.5833,0.5095,0.521,0.5712,0.5802,0.5561,0.5094,0.5355,0.5238,0.5565,0.5967,0.6254,0.6438,0.5936,0.589,0.6216,0.6312,0.6475],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.008899999999999991,0.01680000000000001,0.005400000000000016,0.026400000000000007,0.04760000000000003,0.0539,0.13069999999999998,0.08030000000000001,0.1335,0.1073,0.12789999999999999,0.13030000000000003,0.12150000000000002,0.1597,0.1576,0.1336,0.13959999999999997,0.1454,0.18219999999999997,0.1848,0.1918,0.15869999999999998,0.20619999999999997,0.1815,0.16899999999999998,0.149,0.11829999999999999,0.24050000000000002,0.2846,0.2987,0.2833,0.3055,0.3264,0.3067,0.27390000000000003,0.29490000000000005,0.299,0.3799,0.37400000000000005,0.3433,0.48760000000000003,0.45940000000000003,0.48860000000000003,0.4695,0.4128,0.399,0.3338,0.3063,0.25839999999999996,0.2455,0.2808999999999999,0.3681,0.37470000000000003,0.373,0.3575,0.3701,0.3165,0.39990000000000003,0.35319999999999996,0.33089999999999997,0.3587,0.4013,0.3536,0.3717,0.3296,0.33159999999999995,0.3951,0.4526,0.40829999999999994,0.4333,0.45830000000000004,0.33610000000000007,0.27919999999999995,0.3002,0.3335,0.3380000000000001,0.34320000000000006,0.26059999999999994,0.3358,0.31590000000000007,0.3389,0.3801,0.41529999999999995,0.42450000000000004,0.3932,0.38939999999999997,0.41990000000000005,0.387,0.42529999999999996],"cost_curve":[0.004185,0.00756,0.006601,0.00594,0.0060915,0.0061005,0.0068325,0.0071035,0.00746,0.007824,0.0079105,0.008277,0.033345,0.010996,0.03783,0.012046,0.0101355,0.015907,0.0236945,0.016682,0.012052,0.0170405,0.0125115,0.0178005,0.013516,0.013574,0.01419,0.0149305,0.0156085,0.0153865,0.015662,0.016055,0.0161805,0.0169635,0.0175715,0.0174565,0.0179325,0.0182405,0.0185635,0.018749,0.0189345,0.0194175,0.0201655,0.07176,0.022975,0.0207085,0.020611,0.0210895,0.02126,0.0213055,0.021789,0.022405,0.022315,0.023066,0.0228285,0.0231595,0.023628,0.023946,0.0239915,0.062188,0.0273575,0.025081,0.0254115,0.0257345,0.0257825,0.025978,0.0265965,0.026914,0.0479955,0.0304645,0.0275995,0.02834,0.0287455,0.0285075,0.028858,0.0291835,0.0297665,0.029609,0.0299845,0.030275,0.030753,0.030788,0.031131,0.031614,0.0316995,0.0323625,0.032395,0.032793,0.033001,0.033744]},{"run_name":"icl-gpt-5.4","task":"blind_spectrum_monitoring","run_index":4,"reward":44.63749999999998,"baseline_reward":19.761000000000003,"reference_reward":90.0,"gain":24.87649999999998,"normalized_reward":0.3541693361238055,"normalized_gain":0.3541693361238055,"cost_usd":1.8795105,"latency_seconds":6.095142,"instance_count":90,"reward_curve":[0.2273,0.2437,0.3137,0.3354,0.3178,0.3939,0.3932,0.4088,0.4081,0.4378,0.4213,0.4365,0.4831,0.4576,0.4603,0.4538,0.4091,0.4036,0.4282,0.4209,0.4412,0.4706,0.4478,0.3803,0.355,0.3606,0.3863,0.3867,0.3649,0.3609,0.3735,0.3706,0.3887,0.3837,0.4053,0.434,0.3711,0.514,0.5315,0.5077,0.5,0.5474,0.4708,0.4537,0.4715,0.507,0.4851,0.4877,0.5392,0.5011,0.5335,0.5401,0.5322,0.505,0.4819,0.4565,0.4675,0.5223,0.5118,0.4578,0.5296,0.5403,0.5794,0.5782,0.5604,0.5839,0.5909,0.6132,0.602,0.6874,0.662,0.6913,0.6761,0.6667,0.6635,0.6427,0.6629,0.6528,0.6344,0.6025,0.6219,0.5985,0.6422,0.6333,0.6725,0.6247,0.6391,0.5419,0.5927,0.5899],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2342,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,-0.004500000000000004,0.10199999999999998,0.10899999999999999,0.09370000000000003,0.18109999999999998,0.1659,0.2138,0.18600000000000003,0.2252,0.1809,0.208,0.2638,0.2093,0.2683,0.25639999999999996,0.18520000000000003,0.1766,0.22170000000000004,0.1735,0.23939999999999997,0.26870000000000005,0.23479999999999998,0.17200000000000001,0.1306,0.12729999999999997,0.17689999999999997,0.1762,0.1337,0.1537,0.1753,0.1621,0.1792,0.181,0.1818,0.2201,0.1682,0.2726,0.33419999999999994,0.28740000000000004,0.2736,0.3548,0.2311,0.2321,0.24419999999999997,0.2796,0.26359999999999995,0.25680000000000003,0.30500000000000005,0.2724,0.31579999999999997,0.3186,0.3247,0.2923,0.2573,0.2313,0.26770000000000005,0.2862,0.3163,0.24219999999999997,0.28769999999999996,0.32889999999999997,0.3628,0.3572000000000001,0.3623,0.36839999999999995,0.36369999999999997,0.358,0.3932,0.4662,0.40790000000000004,0.47740000000000005,0.42890000000000006,0.43639999999999995,0.4427,0.405,0.4207000000000001,0.43990000000000007,0.38559999999999994,0.40280000000000005,0.41400000000000003,0.3809,0.4256,0.42319999999999997,0.4532,0.4243,0.4395,0.34020000000000006,0.34850000000000003,0.36769999999999997],"cost_curve":[0.0042225,0.0068075,0.006671,0.00873,0.009708,0.0096085,0.0108265,0.0111625,0.0121735,0.0125585,0.03642,0.013066,0.010867,0.0105205,0.0114795,0.0125005,0.011378,0.0120615,0.011995,0.012011,0.0116745,0.0122055,0.012427,0.0118335,0.0121825,0.0125085,0.012535,0.012754,0.012713,0.0132715,0.014043,0.014164,0.0156755,0.0160635,0.016402,0.0170155,0.015451,0.075784,0.0193315,0.017436,0.017134,0.0186655,0.0180505,0.0188865,0.019107,0.019723,0.019916,0.042145,0.022744,0.0204955,0.020694,0.0212925,0.0212005,0.0215465,0.0220075,0.022053,0.022814,0.023139,0.02354,0.023518,0.0240415,0.024297,0.0248175,0.0254435,0.025476,0.025719,0.026362,0.0262195,0.0269105,0.026998,0.027046,0.027662,0.028245,0.0281425,0.0286355,0.029016,0.0290585,0.029604,0.029769,0.0297965,0.030152,0.03061,0.0309355,0.031281,0.0314765,0.031812,0.0319525,0.0328205,0.0332755,0.033025]},{"run_name":"icl-gpt-5.4","task":"codebase_adaptation","run_index":0,"reward":6.175,"baseline_reward":9.45,"reference_reward":19.0,"gain":-3.2749999999999995,"normalized_reward":-0.3429319371727748,"normalized_gain":-0.3429319371727748,"cost_usd":2.086067,"latency_seconds":5.10435,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.0,0.925,0.0,0.875,0.0,0.8,0.875,0.0,0.0,0.95,0.85,0.0,0.0,0.9,0.0,0.0],"baseline_reward_curve":[0.825,0.7,0.8,0.0,0.8,0.0,0.825,0.725,0.5,0.725,0.775,0.725,0.75,0.75,0.0,0.55,0.0,0.0,0.0],"gain_curve":[-0.825,-0.7,-0.8,0.0,0.125,0.0,0.050000000000000044,-0.725,0.30000000000000004,0.15000000000000002,-0.775,-0.725,0.19999999999999996,0.09999999999999998,0.0,-0.55,0.9,0.0,0.0],"cost_curve":[0.040409,0.0375815,0.0265455,0.0994485,0.0361705,0.14218,0.083329,0.0592845,0.1329185,0.125909,0.098005,0.143995,0.06425,0.1590775,0.203855,0.137629,0.144714,0.161967,0.1887985]},{"run_name":"icl-gpt-5.4","task":"codebase_adaptation","run_index":1,"reward":14.100000000000001,"baseline_reward":9.45,"reference_reward":19.0,"gain":4.650000000000002,"normalized_reward":0.4869109947643981,"normalized_gain":0.4869109947643981,"cost_usd":6.097326,"latency_seconds":6.490611,"instance_count":19,"reward_curve":[0.9,0.7,0.725,0.0,0.8,0.9,0.75,0.7,0.825,0.65,0.925,0.525,0.675,0.8,0.875,0.9,0.775,0.85,0.825],"baseline_reward_curve":[0.825,0.7,0.8,0.0,0.8,0.0,0.825,0.725,0.5,0.725,0.775,0.725,0.75,0.75,0.0,0.55,0.0,0.0,0.0],"gain_curve":[0.07500000000000007,0.0,-0.07500000000000007,0.0,0.0,0.9,-0.07499999999999996,-0.025000000000000022,0.32499999999999996,-0.07499999999999996,0.15000000000000002,-0.19999999999999996,-0.07499999999999996,0.050000000000000044,0.875,0.35,0.775,0.85,0.825],"cost_curve":[0.0587495,0.159099,0.1978205,0.1513225,0.1512565,0.098948,0.2759055,0.3060175,0.2056575,0.544717,0.142443,0.8223165,0.617424,0.457366,0.286671,0.249876,0.540126,0.393208,0.438402]},{"run_name":"icl-gpt-5.4","task":"codebase_adaptation","run_index":2,"reward":4.925,"baseline_reward":9.45,"reference_reward":19.0,"gain":-4.5249999999999995,"normalized_reward":-0.47382198952879573,"normalized_gain":-0.47382198952879573,"cost_usd":3.0839985,"latency_seconds":5.890486,"instance_count":19,"reward_curve":[0.65,0.0,0.0,0.9,0.0,0.0,0.0,0.0,0.875,0.0,0.0,0.0,0.9,0.675,0.925,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.825,0.7,0.8,0.0,0.8,0.0,0.825,0.725,0.5,0.725,0.775,0.725,0.75,0.75,0.0,0.55,0.0,0.0,0.0],"gain_curve":[-0.17499999999999993,-0.7,-0.8,0.9,-0.8,0.0,-0.825,-0.725,0.375,-0.725,-0.775,-0.725,0.15000000000000002,-0.07499999999999996,0.925,-0.55,0.0,0.0,0.0],"cost_curve":[0.067201,0.1104825,0.056716,0.0618755,0.1633875,0.048537,0.089606,0.100887,0.1107585,0.1829445,0.277903,0.284387,0.123103,0.5047585,0.1356405,0.1837345,0.1486045,0.2516465,0.1818255]},{"run_name":"icl-gpt-5.4","task":"codebase_adaptation","run_index":3,"reward":11.25,"baseline_reward":9.45,"reference_reward":19.0,"gain":1.8000000000000007,"normalized_reward":0.18848167539267022,"normalized_gain":0.18848167539267022,"cost_usd":3.8420215,"latency_seconds":6.621309,"instance_count":19,"reward_curve":[0.675,0.0,0.875,0.0,0.0,0.95,0.85,0.9,0.9,0.85,0.925,0.9,0.0,0.925,0.85,0.8,0.0,0.85,0.0],"baseline_reward_curve":[0.825,0.7,0.8,0.0,0.8,0.0,0.825,0.725,0.5,0.725,0.775,0.725,0.75,0.75,0.0,0.55,0.0,0.0,0.0],"gain_curve":[-0.1499999999999999,-0.7,0.07499999999999996,0.0,-0.8,0.95,0.025000000000000022,0.17500000000000004,0.4,0.125,0.15000000000000002,0.17500000000000004,-0.75,0.17500000000000004,0.85,0.25,0.0,0.85,0.0],"cost_curve":[0.266978,0.180798,0.1412535,0.077634,0.099309,0.0721775,0.167697,0.1399705,0.126516,0.200534,0.1442505,0.152601,0.3158335,0.1812815,0.28479,0.453902,0.358261,0.3277185,0.150516]},{"run_name":"icl-gpt-5.4","task":"codebase_adaptation","run_index":4,"reward":14.250000000000002,"baseline_reward":9.45,"reference_reward":19.0,"gain":4.8000000000000025,"normalized_reward":0.5026178010471206,"normalized_gain":0.5026178010471206,"cost_usd":2.6853945,"latency_seconds":5.803999,"instance_count":19,"reward_curve":[0.8,0.875,0.9,0.85,0.925,0.925,0.95,0.95,0.925,0.925,0.0,0.95,0.0,0.825,0.0,0.925,0.825,0.8,0.9],"baseline_reward_curve":[0.825,0.7,0.8,0.0,0.8,0.0,0.825,0.725,0.5,0.725,0.775,0.725,0.75,0.75,0.0,0.55,0.0,0.0,0.0],"gain_curve":[-0.02499999999999991,0.17500000000000004,0.09999999999999998,0.85,0.125,0.925,0.125,0.22499999999999998,0.42500000000000004,0.20000000000000007,-0.775,0.22499999999999998,-0.75,0.07499999999999996,0.0,0.375,0.825,0.8,0.9],"cost_curve":[0.0472925,0.1171815,0.0554635,0.130504,0.096594,0.0728055,0.0524725,0.063426,0.081636,0.084956,0.109649,0.070406,0.215388,0.238144,0.131459,0.2475335,0.273635,0.39698,0.1998685]},{"run_name":"icl-gpt-5.4","task":"cohort_studies","run_index":0,"reward":0.1525,"baseline_reward":0.9944999999999999,"reference_reward":3.24404,"gain":-0.842,"normalized_reward":-0.37429874552130654,"normalized_gain":-0.37429874552130654,"cost_usd":3.964319,"latency_seconds":15.901662,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0265,0.0172,0.0,0.0,0.0,0.0,0.0,0.1088,0.0],"baseline_reward_curve":[0.0,0.0869,0.0,0.0132,0.0,0.006,0.1057,0.0,0.0,0.0,0.0,0.0,0.1462,0.0,0.0941,0.0,0.1199,0.2244,0.0,0.1981],"gain_curve":[0.0,-0.0869,0.0,-0.0132,0.0,-0.006,-0.1057,0.0,0.0,0.0,0.0,0.0265,-0.129,0.0,-0.0941,0.0,-0.1199,-0.2244,0.1088,-0.1981],"cost_curve":[0.2711895,0.167844,0.138968,0.1188715,0.2215405,0.142614,0.1406215,0.1436585,0.26965,0.1595365,0.1626445,0.165852,0.316815,0.182656,0.1842155,0.185632,0.361829,0.2053525,0.209797,0.2150315]},{"run_name":"icl-gpt-5.4","task":"cohort_studies","run_index":1,"reward":1.6822000000000001,"baseline_reward":0.9944999999999999,"reference_reward":3.24404,"gain":0.6877000000000002,"normalized_reward":0.30570694453088193,"normalized_gain":0.30570694453088193,"cost_usd":3.0948965,"latency_seconds":9.483449,"instance_count":20,"reward_curve":[0.0,0.0,0.2897,0.0,0.1043,0.0343,0.3057,0.1879,0.0,0.0,0.0,0.0,0.0158,0.0,0.1366,0.1338,0.1503,0.0842,0.0638,0.1758],"baseline_reward_curve":[0.0,0.0869,0.0,0.0132,0.0,0.006,0.1057,0.0,0.0,0.0,0.0,0.0,0.1462,0.0,0.0941,0.0,0.1199,0.2244,0.0,0.1981],"gain_curve":[0.0,-0.0869,0.2897,-0.0132,0.1043,0.0283,0.2,0.1879,0.0,0.0,0.0,0.0,-0.1304,0.0,0.042499999999999996,0.1338,0.030399999999999983,-0.1402,0.0638,-0.022299999999999986],"cost_curve":[0.193528,0.180096,0.055885,0.115277,0.267671,0.064775,0.1771045,0.069689,0.2339165,0.2254605,0.080314,0.081019,0.2386515,0.086713,0.2147425,0.090828,0.274296,0.245175,0.099382,0.100373]},{"run_name":"icl-gpt-5.4","task":"cohort_studies","run_index":2,"reward":0.6676,"baseline_reward":0.9944999999999999,"reference_reward":3.24404,"gain":-0.32689999999999997,"normalized_reward":-0.14531859846902032,"normalized_gain":-0.14531859846902032,"cost_usd":3.753883,"latency_seconds":8.229981,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0945,0.0289,0.0,0.0,0.0,0.0,0.0,0.0,0.2652,0.279,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0869,0.0,0.0132,0.0,0.006,0.1057,0.0,0.0,0.0,0.0,0.0,0.1462,0.0,0.0941,0.0,0.1199,0.2244,0.0,0.1981],"gain_curve":[0.0,-0.0869,0.0,-0.0132,0.0945,0.022899999999999997,-0.1057,0.0,0.0,0.0,0.0,0.0,0.119,0.279,-0.0941,0.0,-0.1199,-0.2244,0.0,-0.1981],"cost_curve":[0.2128315,0.2440405,0.058553,0.059666,0.2652885,0.06738,0.2780745,0.074281,0.269107,0.081058,0.2920825,0.087232,0.321654,0.093138,0.3459195,0.099243,0.3345915,0.3498815,0.109409,0.110452]},{"run_name":"icl-gpt-5.4","task":"cohort_studies","run_index":3,"reward":0.2033,"baseline_reward":0.9944999999999999,"reference_reward":3.24404,"gain":-0.7911999999999999,"normalized_reward":-0.35171635089840586,"normalized_gain":-0.35171635089840586,"cost_usd":4.868407,"latency_seconds":11.826722,"instance_count":20,"reward_curve":[0.0,0.0544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0645,0.0844,0.0,0.0],"baseline_reward_curve":[0.0,0.0869,0.0,0.0132,0.0,0.006,0.1057,0.0,0.0,0.0,0.0,0.0,0.1462,0.0,0.0941,0.0,0.1199,0.2244,0.0,0.1981],"gain_curve":[0.0,-0.03250000000000001,0.0,-0.0132,0.0,-0.006,-0.1057,0.0,0.0,0.0,0.0,0.0,-0.1462,0.0,-0.0941,0.0,-0.055400000000000005,-0.13999999999999999,0.0,-0.1981],"cost_curve":[0.1467675,0.131523,0.126179,0.145538,0.2387935,0.2064865,0.2205275,0.2374905,0.2846865,0.271296,0.2298265,0.2377085,0.298937,0.276377,0.2811085,0.287491,0.331667,0.296734,0.305603,0.3136665]},{"run_name":"icl-gpt-5.4","task":"cohort_studies","run_index":4,"reward":2.0799,"baseline_reward":0.9944999999999999,"reference_reward":3.24404,"gain":1.0854,"normalized_reward":0.4824986441672519,"normalized_gain":0.4824986441672519,"cost_usd":2.989171,"latency_seconds":10.639118,"instance_count":20,"reward_curve":[0.0,0.0,0.2538,0.0,0.0,0.0185,0.0811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3211,0.5576,0.5478,0.3],"baseline_reward_curve":[0.0,0.0869,0.0,0.0132,0.0,0.006,0.1057,0.0,0.0,0.0,0.0,0.0,0.1462,0.0,0.0941,0.0,0.1199,0.2244,0.0,0.1981],"gain_curve":[0.0,-0.0869,0.2538,-0.0132,0.0,0.012499999999999999,-0.024599999999999997,0.0,0.0,0.0,0.0,0.0,-0.1462,0.0,-0.0941,0.0,0.2012,0.3332,0.5478,0.10189999999999999],"cost_curve":[0.246403,0.186875,0.058727,0.057454,0.226687,0.158539,0.06828,0.069213,0.2732945,0.2004205,0.08045,0.081531,0.269284,0.186732,0.089666,0.090702,0.241935,0.203702,0.099414,0.099862]},{"run_name":"icl-gpt-5.4","task":"database_exploration","run_index":0,"reward":15.466666666666669,"baseline_reward":5.533333333333332,"reference_reward":40.0,"gain":9.933333333333337,"normalized_reward":0.2882011605415862,"normalized_gain":0.2882011605415862,"cost_usd":1.1901525,"latency_seconds":3.961683,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.7333333333333334,0.0,0.8,0.0,0.8,0.6,0.0,0.8666666666666667,0.0,0.0,0.8,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.7333333333333334,0.8666666666666667,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.0,0.9333333333333333,0.8666666666666667],"baseline_reward_curve":[0.4,0.0,0.0,0.6,0.0,0.4,0.0,0.0,0.6,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.0,0.06666666666666665],"gain_curve":[-0.4,0.0,0.0,-0.6,0.0,0.5333333333333333,0.7333333333333334,0.0,0.20000000000000007,0.0,0.2666666666666667,0.0,0.0,0.33333333333333337,0.0,0.0,0.8,-0.4666666666666667,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.2666666666666667,0.8666666666666667,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,-0.33333333333333337,0.4,0.0,0.9333333333333333,0.8],"cost_curve":[0.0390755,0.0122295,0.03071,0.016613,0.0093075,0.00899,0.024906,0.0226345,0.0222425,0.017961,0.0248285,0.046455,0.0313085,0.0241945,0.0157475,0.015862,0.0336755,0.0162475,0.017061,0.0359165,0.02781,0.0199985,0.01864,0.051038,0.0299425,0.0545305,0.0220835,0.0214955,0.0463635,0.0372295,0.0492035,0.0242525,0.1184505,0.0256425,0.0262325,0.0265895,0.025979,0.0267715,0.0265385,0.045395]},{"run_name":"icl-gpt-5.4","task":"database_exploration","run_index":1,"reward":7.466666666666668,"baseline_reward":5.533333333333332,"reference_reward":40.0,"gain":1.9333333333333353,"normalized_reward":0.05609284332688594,"normalized_gain":0.05609284332688594,"cost_usd":0.9127905,"latency_seconds":3.787689,"instance_count":40,"reward_curve":[0.33333333333333337,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.8,0.0],"baseline_reward_curve":[0.4,0.0,0.0,0.6,0.0,0.4,0.0,0.0,0.6,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.0,0.06666666666666665],"gain_curve":[-0.06666666666666665,0.0,0.0,-0.6,0.0,-0.4,0.0,0.8,-0.6,0.0,-0.5333333333333333,-0.6,0.0,-0.5333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333,-0.4666666666666667,0.0,0.0,0.0,0.8666666666666667,0.0,-0.4666666666666667,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,-0.33333333333333337,-0.5333333333333333,0.9333333333333333,0.8,-0.06666666666666665],"cost_curve":[0.027633,0.0255515,0.019465,0.0294175,0.0141535,0.0200035,0.0113515,0.0218325,0.024411,0.0118975,0.012803,0.018625,0.013573,0.012965,0.014996,0.013465,0.0140285,0.014766,0.0148485,0.015736,0.0318835,0.0243925,0.053053,0.0394285,0.029209,0.0202055,0.0192025,0.042708,0.0217965,0.0217835,0.021531,0.0228235,0.0226055,0.0228125,0.022725,0.0243675,0.024149,0.023274,0.0484925,0.024825]},{"run_name":"icl-gpt-5.4","task":"database_exploration","run_index":2,"reward":14.800000000000004,"baseline_reward":5.533333333333332,"reference_reward":40.0,"gain":9.266666666666673,"normalized_reward":0.26885880077369456,"normalized_gain":0.26885880077369456,"cost_usd":0.989738,"latency_seconds":3.590275,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.9333333333333333,0.7333333333333334,0.7333333333333334,0.0,0.8,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.8,0.0,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.0,0.0,0.8,0.0,0.0,0.0,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.8,0.0,0.8666666666666667,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333],"baseline_reward_curve":[0.4,0.0,0.0,0.6,0.0,0.4,0.0,0.0,0.6,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.0,0.06666666666666665],"gain_curve":[-0.4,0.0,0.0,0.33333333333333337,0.7333333333333334,0.33333333333333337,0.0,0.8,-0.6,0.0,-0.5333333333333333,0.33333333333333337,0.0,-0.5333333333333333,0.8,0.0,0.9333333333333333,-0.4666666666666667,0.9333333333333333,0.9333333333333333,0.0,0.0,0.8,-0.4666666666666667,0.0,0.0,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.8,0.0,0.8666666666666667,0.0,0.0,0.0,0.6,-0.5333333333333333,0.0,0.0,0.8666666666666667],"cost_curve":[0.02186,0.056439,0.0084685,0.008636,0.024886,0.025966,0.011176,0.0224495,0.0114595,0.012367,0.0127895,0.0141795,0.013886,0.020979,0.0280025,0.030541,0.0149975,0.015328,0.016273,0.015848,0.0439135,0.028068,0.036714,0.0321165,0.019706,0.0295075,0.030768,0.0221995,0.021429,0.0435125,0.047655,0.035497,0.023973,0.0237555,0.0244955,0.0243975,0.024072,0.0398365,0.0259915,0.0255985]},{"run_name":"icl-gpt-5.4","task":"database_exploration","run_index":3,"reward":14.000000000000002,"baseline_reward":5.533333333333332,"reference_reward":40.0,"gain":8.466666666666669,"normalized_reward":0.2456479690522244,"normalized_gain":0.2456479690522244,"cost_usd":0.982269,"latency_seconds":3.547977,"instance_count":40,"reward_curve":[0.5333333333333333,0.5333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.7333333333333334,0.9333333333333333,0.8,0.5333333333333333,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.9333333333333333,0.8666666666666667],"baseline_reward_curve":[0.4,0.0,0.0,0.6,0.0,0.4,0.0,0.0,0.6,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.0,0.06666666666666665],"gain_curve":[0.1333333333333333,0.5333333333333333,0.0,-0.6,0.0,-0.4,0.9333333333333333,0.0,-0.6,0.7333333333333334,0.4,0.20000000000000007,0.5333333333333333,-0.5333333333333333,0.8,0.0,0.0,-0.4666666666666667,0.0,0.0,0.0,0.0,0.8,-0.4666666666666667,0.9333333333333333,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.0,0.6,0.4,0.9333333333333333,0.9333333333333333,0.8],"cost_curve":[0.020123,0.024778,0.0138235,0.0067815,0.0074725,0.0546735,0.0098765,0.010892,0.011637,0.0295865,0.0132505,0.025057,0.0547835,0.022163,0.031092,0.0321665,0.0162645,0.016577,0.016965,0.0175145,0.0559645,0.027975,0.0401795,0.0199995,0.0201345,0.0211895,0.0221095,0.0219585,0.0223585,0.0239305,0.023057,0.035316,0.024402,0.023559,0.023822,0.024677,0.0257645,0.025504,0.025709,0.0391815]},{"run_name":"icl-gpt-5.4","task":"database_exploration","run_index":4,"reward":17.66666666666667,"baseline_reward":5.533333333333332,"reference_reward":40.0,"gain":12.13333333333334,"normalized_reward":0.3520309477756288,"normalized_gain":0.3520309477756288,"cost_usd":1.0840885,"latency_seconds":3.978777,"instance_count":40,"reward_curve":[0.6,0.6,0.0,0.0,0.9333333333333333,0.6666666666666667,0.0,0.8,0.9333333333333333,0.8666666666666667,0.7333333333333334,0.0,0.0,0.9333333333333333,0.0,0.0,0.9333333333333333,0.0,0.9333333333333333,0.8,0.0,0.7333333333333334,0.0,0.0,0.8,0.8,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.9333333333333333,0.0,0.9333333333333333,0.9333333333333333,0.0],"baseline_reward_curve":[0.4,0.0,0.0,0.6,0.0,0.4,0.0,0.0,0.6,0.0,0.5333333333333333,0.6,0.0,0.5333333333333333,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.0,0.06666666666666665],"gain_curve":[0.19999999999999996,0.6,0.0,-0.6,0.9333333333333333,0.2666666666666667,0.0,0.8,0.33333333333333337,0.8666666666666667,0.20000000000000007,-0.6,0.0,0.4,0.0,0.0,0.9333333333333333,-0.4666666666666667,0.9333333333333333,0.8,0.0,0.7333333333333334,0.0,-0.4666666666666667,0.8,0.8,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.9333333333333333,0.6,-0.5333333333333333,0.9333333333333333,0.9333333333333333,-0.06666666666666665],"cost_curve":[0.0240145,0.0246325,0.0178975,0.0176725,0.008813,0.028907,0.0304875,0.0221575,0.012258,0.018512,0.034279,0.0270405,0.014414,0.013999,0.023642,0.015425,0.01684,0.0161375,0.0169925,0.037597,0.018108,0.047357,0.019314,0.0404275,0.0406645,0.042631,0.022135,0.022165,0.05804,0.0237065,0.022636,0.0239015,0.0412045,0.038182,0.0255095,0.026617,0.0661225,0.026918,0.028393,0.0283375]},{"run_name":"icl-gpt-5.4","task":"exploitable_poker","run_index":0,"reward":127.5,"baseline_reward":133.6,"reference_reward":1138.5,"gain":-6.099999999999994,"normalized_reward":-0.006070255746840476,"normalized_gain":-0.006070255746840476,"cost_usd":4.500986,"latency_seconds":6.435621,"instance_count":120,"reward_curve":[-2.0,-0.5,-2.0,13.0,-3.0,1.0,3.0,-3.0,30.0,-32.0,5.0,-17.5,6.0,5.0,-3.0,-4.5,4.0,-7.0,7.0,32.5,-1.0,4.0,-0.5,1.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,2.0,0.5,1.0,-7.0,0.5,0.5,-3.0,2.0,1.0,42.0,-1.0,-2.0,-0.5,-0.5,2.0,-3.0,10.0,-1.0,-0.5,2.0,5.0,2.0,-4.0,-5.0,2.0,14.0,-0.5,-4.0,-3.0,-2.0,-0.5,0.5,0.5,-2.0,1.0,1.0,1.0,-0.5,1.0,-2.0,2.0,10.0,0.5,-0.5,-5.0,3.0,-0.5,0.5,-3.0,1.0,1.0,0.5,11.0,3.0,3.0,-3.0,1.0,3.0,1.0,1.0,0.5,1.0,-8.0,-5.0,0.5,0.5,3.0,-1.0,4.0,3.0,0.5,-1.0,-2.0,1.0,1.0,-1.0,0.5,-43.0,-1.0,1.0,3.0,2.0,2.0,3.0,-1.0,2.0,49.0,3.0,0.5,-5.0],"baseline_reward_curve":[-1.0,3.0,-1.0,23.0,-4.0,2.0,6.5,-5.0,13.0,-10.0,8.0,-7.0,14.0,16.0,-4.0,-4.2,4.0,-4.0,6.5,9.0,2.0,-5.0,2.0,2.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-1.0,1.0,2.0,-6.0,10.0,-0.5,-0.5,2.0,3.0,2.0,-7.0,-7.0,2.5,10.0,4.0,-2.0,-9.0,-2.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,4.0,1.0,-11.2,2.0,20.0,0.5,-3.0,-4.0,4.0,-0.5,0.5,-2.0,4.0,-2.0,0.5,5.0,-2.0,3.0,-2.0,1.0,15.0,1.0,-4.0,0.5,-4.0,-0.5,-2.0,0.5,0.5,1.0,-1.0,4.0,1.0,0.5,2.0,-2.0,1.0,-1.0,2.0,0.5,-11.0,2.0,3.0,6.0,-1.0,2.0,5.0,-1.0,2.0,16.0,5.0,0.5,-4.0],"gain_curve":[-1.0,-3.5,-1.0,-10.0,1.0,-1.0,-3.5,2.0,17.0,-22.0,-3.0,-10.5,-8.0,-11.0,1.0,-0.2999999999999998,0.0,-3.0,0.5,23.5,-3.0,9.0,-2.5,-1.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,2.0,-4.0,0.0,0.0,-1.0,2.0,0.0,32.0,0.0,0.0,0.5,-1.5,0.0,3.0,0.0,-0.5,0.0,0.0,2.0,0.0,3.0,2.0,-0.5,4.0,-4.5,-2.0,6.0,0.0,-1.5,0.0,0.0,0.0,0.0,0.0,0.0,-4.5,0.0,9.2,0.0,-10.0,0.0,2.5,-1.0,-1.0,0.0,0.0,-1.0,-3.0,3.0,0.0,6.0,5.0,0.0,-1.0,0.0,-12.0,0.0,5.0,0.0,5.0,-7.5,-3.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,-3.0,0.0,0.0,2.0,-3.0,0.0,-32.0,-3.0,-2.0,-3.0,3.0,0.0,-2.0,0.0,0.0,33.0,-2.0,0.0,-1.0],"cost_curve":[0.013525,0.0034085,0.0120055,0.0142325,0.0155615,0.0150625,0.0168925,0.01842,0.02041,0.021684,0.0225705,0.0245155,0.0254545,0.026576,0.027581,0.030711,0.030789,0.0319585,0.0335425,0.0344395,0.008979,0.043448,0.008959,0.018246,0.018491,0.00945,0.0,0.009816,0.0,0.01973,0.029303,0.0,0.010138,0.052685,0.0,0.0,0.0219515,0.0329135,0.022113,0.0691585,0.034863,0.0355275,0.012272,0.012243,0.0367505,0.025027,0.0508445,0.0260755,0.0131285,0.0400595,0.0550185,0.0552725,0.057148,0.0584015,0.059844,0.0610415,0.015401,0.0629505,0.063453,0.0646585,0.0164325,0.0,0.0,0.066487,0.0337545,0.0171705,0.0341565,0.0171765,0.01735,0.070029,0.071272,0.0722145,0.0,0.018764,0.074423,0.0565875,0.018916,0.0,0.0772505,0.0195995,0.0196055,0.0,0.0793355,0.0400565,0.0400715,0.0823935,0.0206015,0.041276,0.041796,0.0418235,0.0,0.0426785,0.0857815,0.086959,0.0,0.0,0.0445405,0.04421,0.066982,0.0452505,0.0,0.0455725,0.068515,0.069122,0.023381,0.0233745,0.0,0.167191,0.024235,0.024078,0.072914,0.098197,0.0744155,0.126888,0.0509285,0.0768315,0.209218,0.0530975,0.0,0.1333805]},{"run_name":"icl-gpt-5.4","task":"exploitable_poker","run_index":1,"reward":63.99999999999998,"baseline_reward":133.6,"reference_reward":1138.5,"gain":-69.60000000000002,"normalized_reward":-0.069260622947557,"normalized_gain":-0.069260622947557,"cost_usd":4.9077535,"latency_seconds":6.6597,"instance_count":120,"reward_curve":[13.0,5.2,2.2,-4.6,5.2,27.6,-11.8,-4.8,5.6,9.4,-4.8,15.4,5.6,-5.6,4.6,-8.8,-3.0,-2.2,-27.4,2.2,-1.0,-0.5,2.0,1.0,-0.5,0.5,10.0,-3.0,2.0,0.5,-1.0,-1.0,0.5,-1.0,-1.0,0.5,-1.0,2.0,1.0,0.5,3.0,-0.5,-7.0,1.0,2.0,10.0,1.0,-6.6,-0.5,1.0,-7.6,15.8,-4.8,-4.8,-7.0,4.0,2.0,3.8,6.0,-2.2,6.4,-0.5,-7.2,3.0,1.0,-0.5,0.5,0.5,-3.6,3.0,0.5,1.0,1.0,1.0,1.0,-3.6,1.0,3.0,-3.6,-2.0,0.5,0.5,1.0,0.5,-6.0,3.0,8.6,-4.8,-2.0,1.0,0.5,1.0,1.0,1.0,2.0,-4.8,0.5,-1.0,4.0,1.0,-15.0,1.0,-1.0,0.5,-1.0,2.0,-2.0,1.0,0.5,3.0,-1.0,2.0,12.0,-4.0,4.0,0.5,3.0,3.0,3.0,1.0],"baseline_reward_curve":[-1.0,3.0,-1.0,23.0,-4.0,2.0,6.5,-5.0,13.0,-10.0,8.0,-7.0,14.0,16.0,-4.0,-4.2,4.0,-4.0,6.5,9.0,2.0,-5.0,2.0,2.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-1.0,1.0,2.0,-6.0,10.0,-0.5,-0.5,2.0,3.0,2.0,-7.0,-7.0,2.5,10.0,4.0,-2.0,-9.0,-2.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,4.0,1.0,-11.2,2.0,20.0,0.5,-3.0,-4.0,4.0,-0.5,0.5,-2.0,4.0,-2.0,0.5,5.0,-2.0,3.0,-2.0,1.0,15.0,1.0,-4.0,0.5,-4.0,-0.5,-2.0,0.5,0.5,1.0,-1.0,4.0,1.0,0.5,2.0,-2.0,1.0,-1.0,2.0,0.5,-11.0,2.0,3.0,6.0,-1.0,2.0,5.0,-1.0,2.0,16.0,5.0,0.5,-4.0],"gain_curve":[14.0,2.2,3.2,-27.6,9.2,25.6,-18.3,0.20000000000000018,-7.4,19.4,-12.8,22.4,-8.4,-21.6,8.6,-4.6000000000000005,-7.0,1.7999999999999998,-33.9,-6.8,-3.0,4.5,0.0,-1.0,-1.5,1.0,9.5,-2.0,1.5,1.5,-3.0,-1.5,1.5,2.0,-1.5,0.0,1.0,2.0,0.0,-9.5,4.0,1.5,-6.0,0.0,0.0,16.0,-9.0,-6.1,0.0,-1.0,-10.6,13.8,2.2,2.2,-9.5,-6.0,-2.0,5.8,15.0,-0.20000000000000018,5.4,-1.0,-7.7,5.0,0.0,-1.5,-0.5,-3.5,-4.6,14.2,-1.5,-19.0,0.5,4.0,5.0,-7.6,1.5,2.5,-1.6,-6.0,2.5,0.0,-4.0,2.5,-9.0,5.0,7.6,-19.8,-3.0,5.0,0.0,5.0,1.5,3.0,1.5,-5.3,-0.5,0.0,0.0,0.0,-15.5,-1.0,1.0,-0.5,0.0,0.0,-2.5,12.0,-1.5,0.0,-7.0,3.0,10.0,-9.0,5.0,-1.5,-13.0,-2.0,2.5,5.0],"cost_curve":[0.01359,0.015911,0.0124605,0.014478,0.015388,0.0175205,0.019439,0.0200875,0.0206815,0.0230615,0.02316,0.025015,0.0266895,0.0284435,0.0294125,0.0308415,0.0308455,0.0325855,0.0349075,0.035159,0.0095,0.0089605,0.027598,0.018767,0.009633,0.0,0.0491885,0.040483,0.032055,0.0,0.0110365,0.021826,0.0,0.022043,0.0221275,0.0,0.0334585,0.034025,0.0233675,0.0,0.0491825,0.012277,0.0631545,0.025722,0.0389955,0.0537425,0.026754,0.082055,0.0143085,0.0142565,0.0571075,0.0586305,0.0595755,0.0608705,0.061783,0.0645035,0.065038,0.0664105,0.0679105,0.0691005,0.06998,0.017775,0.0726065,0.0362855,0.0367055,0.018451,0.0,0.0,0.075739,0.037744,0.0,0.038151,0.038138,0.038773,0.0386255,0.079654,0.020124,0.0401875,0.081548,0.0832035,0.0,0.0,0.0213475,0.0,0.08442,0.0429515,0.0863875,0.088158,0.089373,0.0451865,0.0,0.0228275,0.022736,0.0228895,0.0919695,0.117001,0.0,0.0476155,0.1215725,0.024188,0.1466895,0.0496855,0.0493825,0.0,0.0250895,0.0755675,0.076652,0.0768155,0.0,0.051825,0.0259895,0.0784075,0.159307,0.1345505,0.082246,0.0,0.0549965,0.055211,0.139528,0.028372]},{"run_name":"icl-gpt-5.4","task":"exploitable_poker","run_index":2,"reward":21.0,"baseline_reward":133.6,"reference_reward":1138.5,"gain":-112.6,"normalized_reward":-0.11205095034331775,"normalized_gain":-0.11205095034331775,"cost_usd":4.5701645,"latency_seconds":6.479726,"instance_count":120,"reward_curve":[4.0,-3.0,-7.0,1.0,-2.0,4.0,-3.0,-9.0,-7.0,2.0,8.0,-1.0,-2.0,7.0,2.0,1.0,3.0,-1.0,4.0,3.0,0.0,-0.5,1.0,-2.0,-0.5,0.5,1.0,-4.0,0.5,0.5,-1.0,1.0,1.0,3.0,-1.0,0.5,-1.0,2.0,4.0,1.0,-1.0,4.0,-1.0,4.0,-1.0,0.5,2.0,-1.0,-1.0,2.0,3.0,1.0,-3.0,-4.0,-1.0,1.0,5.0,-6.0,4.0,-3.0,0.5,2.0,0.5,-2.0,0.5,1.0,-0.5,-7.0,-6.0,2.0,1.0,8.0,1.0,-2.0,-0.5,2.0,1.0,1.0,-2.0,1.0,1.0,1.0,0.5,1.0,5.0,0.5,0.5,-4.0,-2.0,1.0,1.0,-4.0,-2.0,0.5,2.0,-1.0,2.0,-1.0,1.0,-1.0,0.5,-3.0,-6.0,2.0,-2.0,-1.0,2.0,0.5,-1.0,6.0,-1.0,1.0,-1.0,-1.0,-1.0,0.5,2.0,0.5,2.0,3.0],"baseline_reward_curve":[-1.0,3.0,-1.0,23.0,-4.0,2.0,6.5,-5.0,13.0,-10.0,8.0,-7.0,14.0,16.0,-4.0,-4.2,4.0,-4.0,6.5,9.0,2.0,-5.0,2.0,2.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-1.0,1.0,2.0,-6.0,10.0,-0.5,-0.5,2.0,3.0,2.0,-7.0,-7.0,2.5,10.0,4.0,-2.0,-9.0,-2.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,4.0,1.0,-11.2,2.0,20.0,0.5,-3.0,-4.0,4.0,-0.5,0.5,-2.0,4.0,-2.0,0.5,5.0,-2.0,3.0,-2.0,1.0,15.0,1.0,-4.0,0.5,-4.0,-0.5,-2.0,0.5,0.5,1.0,-1.0,4.0,1.0,0.5,2.0,-2.0,1.0,-1.0,2.0,0.5,-11.0,2.0,3.0,6.0,-1.0,2.0,5.0,-1.0,2.0,16.0,5.0,0.5,-4.0],"gain_curve":[5.0,-6.0,-6.0,-22.0,2.0,2.0,-9.5,-4.0,-20.0,12.0,0.0,6.0,-16.0,-9.0,6.0,5.2,-1.0,3.0,-2.5,-6.0,-2.0,4.5,-1.0,-4.0,-1.5,1.0,0.5,-3.0,0.0,1.5,-3.0,0.5,2.0,6.0,-1.5,0.0,1.0,2.0,3.0,-9.0,0.0,6.0,0.0,3.0,-3.0,6.5,-8.0,-0.5,-0.5,0.0,0.0,-1.0,4.0,3.0,-3.5,-9.0,1.0,-4.0,13.0,-1.0,-0.5,1.5,0.0,0.0,-0.5,0.0,-1.5,-11.0,-7.0,13.2,-1.0,-12.0,0.5,1.0,3.5,-2.0,1.5,0.5,0.0,-3.0,3.0,0.5,-4.5,3.0,2.0,2.5,-0.5,-19.0,-3.0,5.0,0.5,0.0,-1.5,2.5,1.5,-1.5,1.0,0.0,-3.0,-2.0,0.0,-5.0,-4.0,1.0,-1.0,-3.0,1.5,11.5,-3.0,3.0,-7.0,2.0,-3.0,-6.0,0.0,-1.5,-14.0,-4.5,1.5,7.0],"cost_curve":[0.0133075,0.0144565,0.014139,0.0153055,0.0146285,0.0166995,0.017511,0.0195235,0.020541,0.022161,0.0232555,0.023993,0.025003,0.026996,0.0278385,0.0286235,0.030696,0.0310585,0.0326365,0.0343595,0.035522,0.00918,0.009231,0.0185075,0.0093245,0.0,0.0193195,0.0495695,0.0,0.0,0.010822,0.0201125,0.020395,0.0423505,0.0325085,0.0,0.022356,0.0331965,0.047575,0.023396,0.024016,0.036676,0.0246815,0.050601,0.025672,0.0,0.0388805,0.0132055,0.026639,0.055061,0.055196,0.056859,0.057829,0.0593645,0.059783,0.061536,0.063391,0.0648235,0.0659235,0.067311,0.0,0.034588,0.0,0.069476,0.0,0.0350165,0.017371,0.071465,0.07296,0.0734825,0.01883,0.0763415,0.0381155,0.0772565,0.019676,0.0392075,0.03926,0.0198895,0.0799245,0.0617415,0.020557,0.0206405,0.0,0.0207515,0.0833575,0.0,0.0,0.086088,0.087124,0.043928,0.043475,0.0900945,0.090326,0.0,0.0460805,0.04614,0.046545,0.023467,0.046957,0.023698,0.0,0.0951745,0.1209985,0.073839,0.073858,0.075055,0.050277,0.0,0.050429,0.1020985,0.05132,0.077557,0.0519355,0.0522905,0.0524555,0.0,0.0534105,0.0,0.05373,0.108357]},{"run_name":"icl-gpt-5.4","task":"exploitable_poker","run_index":3,"reward":91.0,"baseline_reward":133.6,"reference_reward":1138.5,"gain":-42.599999999999994,"normalized_reward":-0.0423922778385909,"normalized_gain":-0.0423922778385909,"cost_usd":4.38188,"latency_seconds":6.124409,"instance_count":120,"reward_curve":[4.0,24.0,4.0,7.0,-5.0,-28.0,4.0,-3.0,-8.0,3.5,-5.0,2.0,-4.0,1.0,-2.0,-2.0,24.0,-8.0,5.0,24.0,-0.5,-1.0,-1.0,1.0,2.0,-2.0,-1.0,-1.0,2.0,-2.0,1.0,0.5,0.5,1.0,0.5,-1.0,2.0,-5.0,-1.0,8.0,32.0,-0.5,-7.0,-0.5,0.5,-1.0,-0.5,-5.0,0.5,1.0,-5.0,-3.0,10.0,9.0,-0.5,2.0,-8.0,-5.0,4.0,-2.0,1.0,3.0,0.5,12.0,3.0,-0.5,-2.0,2.0,1.0,0.5,1.0,1.0,0.5,-2.0,1.0,-2.0,-0.5,-0.5,0.5,-8.0,8.0,0.5,0.5,-4.0,-4.0,1.0,1.0,1.0,0.5,-0.5,1.0,-2.0,3.0,1.0,3.0,-20.0,14.0,-1.0,0.5,0.5,-1.0,3.0,4.0,0.5,3.0,2.0,3.0,1.0,4.0,-1.0,0.5,2.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-4.0,1.0],"baseline_reward_curve":[-1.0,3.0,-1.0,23.0,-4.0,2.0,6.5,-5.0,13.0,-10.0,8.0,-7.0,14.0,16.0,-4.0,-4.2,4.0,-4.0,6.5,9.0,2.0,-5.0,2.0,2.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-1.0,1.0,2.0,-6.0,10.0,-0.5,-0.5,2.0,3.0,2.0,-7.0,-7.0,2.5,10.0,4.0,-2.0,-9.0,-2.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,4.0,1.0,-11.2,2.0,20.0,0.5,-3.0,-4.0,4.0,-0.5,0.5,-2.0,4.0,-2.0,0.5,5.0,-2.0,3.0,-2.0,1.0,15.0,1.0,-4.0,0.5,-4.0,-0.5,-2.0,0.5,0.5,1.0,-1.0,4.0,1.0,0.5,2.0,-2.0,1.0,-1.0,2.0,0.5,-11.0,2.0,3.0,6.0,-1.0,2.0,5.0,-1.0,2.0,16.0,5.0,0.5,-4.0],"gain_curve":[5.0,21.0,5.0,-16.0,-1.0,-30.0,-2.5,2.0,-21.0,13.5,-13.0,9.0,-18.0,-15.0,2.0,2.2,20.0,-4.0,-1.5,15.0,-2.5,4.0,-3.0,-1.0,1.0,-1.5,-1.5,0.0,1.5,-1.0,-1.0,0.0,1.5,4.0,0.0,-1.5,4.0,-5.0,-2.0,-2.0,33.0,1.5,-6.0,-1.5,-1.5,5.0,-10.5,-4.5,1.0,-1.0,-8.0,-5.0,17.0,16.0,-3.0,-8.0,-12.0,-3.0,13.0,0.0,0.0,2.5,0.0,14.0,2.0,-1.5,-3.0,-2.0,0.0,11.7,-1.0,-19.0,0.0,1.0,5.0,-6.0,0.0,-1.0,2.5,-12.0,10.0,0.0,-4.5,-2.0,-7.0,3.0,0.0,-14.0,-0.5,3.5,0.5,2.0,3.5,3.0,2.5,-20.5,13.0,0.0,-3.5,-0.5,-1.5,1.0,6.0,-0.5,4.0,0.0,2.5,12.0,2.0,-4.0,-5.5,3.0,-3.0,-6.0,0.0,-1.0,-17.0,-6.0,-4.5,5.0],"cost_curve":[0.0142675,0.0121235,0.0132905,0.015415,0.0156685,0.0174535,0.0184875,0.018669,0.0208295,0.0220615,0.0234365,0.025234,0.0266105,0.027422,0.0289145,0.029639,0.0314015,0.033401,0.03356,0.0351525,0.0092085,0.0180365,0.0271355,0.018512,0.02872,0.029447,0.010421,0.0203475,0.030993,0.031925,0.010932,0.0,0.0,0.0224865,0.0,0.0220605,0.0337345,0.0819665,0.0241265,0.036591,0.063485,0.013008,0.065522,0.0133655,0.0,0.026783,0.0134835,0.0684355,0.0,0.028518,0.0567895,0.0584475,0.059945,0.0618675,0.015623,0.0627135,0.0638785,0.0654215,0.0665915,0.0670915,0.034302,0.034229,0.0,0.0703885,0.0355915,0.017929,0.0726525,0.0736525,0.037509,0.0,0.01888,0.0189035,0.0,0.076357,0.0194045,0.0783745,0.019788,0.019759,0.0,0.0803745,0.081917,0.0,0.0,0.083668,0.08437,0.0427225,0.0430275,0.021607,0.0,0.021768,0.0433935,0.088748,0.0444735,0.044358,0.045081,0.1149885,0.093258,0.023548,0.0,0.0,0.0240365,0.0472325,0.0717345,0.0,0.0483125,0.097182,0.049118,0.024746,0.074947,0.050433,0.0,0.051112,0.0506895,0.0766775,0.0517725,0.051489,0.025914,0.052194,0.131743,0.0268725]},{"run_name":"icl-gpt-5.4","task":"exploitable_poker","run_index":4,"reward":175.3,"baseline_reward":133.6,"reference_reward":1138.5,"gain":41.70000000000002,"normalized_reward":0.04149666633495872,"normalized_gain":0.04149666633495872,"cost_usd":4.6898905,"latency_seconds":6.543822,"instance_count":120,"reward_curve":[13.5,-9.0,33.0,29.5,-3.0,5.0,3.0,2.0,1.0,-2.0,-11.2,-5.0,5.0,34.5,-35.0,-2.0,-5.0,2.0,5.0,-1.0,1.0,0.5,10.0,1.0,-1.0,2.0,-1.0,-2.0,-3.0,38.0,-5.0,-1.0,0.5,-0.5,-1.0,-7.0,-0.5,2.0,-1.0,-1.0,0.5,-0.5,1.0,0.5,0.5,2.0,1.0,-1.0,2.0,-0.5,-3.0,-5.0,-5.0,-5.0,4.5,1.0,-1.0,14.0,2.0,2.0,1.0,11.0,1.0,0.5,-9.5,-4.5,-9.5,1.0,1.0,0.5,-9.5,3.0,1.0,-0.5,9.5,2.0,-2.0,0.5,0.5,1.0,0.5,1.0,1.0,1.0,-3.0,0.5,3.0,-4.0,14.0,1.0,0.5,3.0,1.0,-2.0,1.0,3.0,-1.0,-1.0,1.0,-1.0,0.5,4.0,-1.0,-1.0,-3.0,-1.0,64.0,0.5,0.5,-0.5,5.0,1.0,3.0,3.0,0.5,-1.0,1.0,-17.0,-1.0,-1.0],"baseline_reward_curve":[-1.0,3.0,-1.0,23.0,-4.0,2.0,6.5,-5.0,13.0,-10.0,8.0,-7.0,14.0,16.0,-4.0,-4.2,4.0,-4.0,6.5,9.0,2.0,-5.0,2.0,2.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-1.0,1.0,2.0,-6.0,10.0,-0.5,-0.5,2.0,3.0,2.0,-7.0,-7.0,2.5,10.0,4.0,-2.0,-9.0,-2.0,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,4.0,1.0,-11.2,2.0,20.0,0.5,-3.0,-4.0,4.0,-0.5,0.5,-2.0,4.0,-2.0,0.5,5.0,-2.0,3.0,-2.0,1.0,15.0,1.0,-4.0,0.5,-4.0,-0.5,-2.0,0.5,0.5,1.0,-1.0,4.0,1.0,0.5,2.0,-2.0,1.0,-1.0,2.0,0.5,-11.0,2.0,3.0,6.0,-1.0,2.0,5.0,-1.0,2.0,16.0,5.0,0.5,-4.0],"gain_curve":[14.5,-12.0,34.0,6.5,1.0,3.0,-3.5,7.0,-12.0,8.0,-19.2,2.0,-9.0,18.5,-31.0,2.2,-9.0,6.0,-1.5,-10.0,-1.0,5.5,8.0,-1.0,-2.0,2.5,-1.5,-1.0,-3.5,39.0,-7.0,-1.5,1.5,2.5,-1.5,-7.5,1.5,2.0,-2.0,-11.0,1.5,1.5,2.0,-0.5,-1.5,8.0,-9.0,-0.5,2.5,-2.5,-6.0,-7.0,2.0,2.0,2.0,-9.0,-5.0,16.0,11.0,4.0,0.0,10.5,0.5,2.5,-10.5,-5.5,-10.5,-3.0,0.0,11.7,-11.5,-17.0,0.5,2.5,13.5,-2.0,-1.5,0.0,2.5,-3.0,2.5,0.5,-4.0,3.0,-6.0,2.5,2.0,-19.0,13.0,5.0,0.0,7.0,1.5,0.0,0.5,2.5,-2.0,0.0,-3.0,-2.0,0.0,2.0,1.0,-2.0,-2.0,-3.0,63.5,11.5,-1.5,-3.5,-1.0,2.0,1.0,-2.0,1.5,-3.0,-15.0,-22.0,-1.5,3.0],"cost_curve":[0.0145075,0.012306,0.014405,0.0148485,0.015715,0.016597,0.0176815,0.019729,0.020609,0.021894,0.023704,0.0253255,0.0262645,0.027647,0.029191,0.02987,0.03117,0.0329125,0.0337745,0.034072,0.01799,0.0,0.0374435,0.0184995,0.0191945,0.0295625,0.0103495,0.0297635,0.0509105,0.053678,0.055592,0.0228695,0.0,0.011569,0.0233335,0.06035,0.0123565,0.0370955,0.0369465,0.0252135,0.0,0.0130125,0.026123,0.0,0.0,0.040093,0.013507,0.027002,0.041485,0.0140495,0.0554375,0.058187,0.059027,0.06109,0.0622165,0.062373,0.0638145,0.066365,0.0670995,0.0681165,0.034387,0.070397,0.017763,0.0,0.071487,0.072992,0.0748145,0.0375135,0.0190485,0.0,0.0775555,0.0584135,0.0391845,0.019708,0.079126,0.080884,0.0818065,0.0,0.0,0.042177,0.0,0.0420135,0.0208535,0.021077,0.0859325,0.0,0.04345,0.088387,0.089407,0.022533,0.0,0.0453335,0.045293,0.0921415,0.0465935,0.047048,0.0469705,0.0717565,0.0242475,0.0477545,0.0,0.0732075,0.049035,0.024952,0.1244345,0.0251645,0.1279,0.0,0.0,0.026167,0.1817,0.0787485,0.0531145,0.053332,0.0,0.026893,0.0268735,0.163866,0.0276355,0.054878]},{"run_name":"icl-gpt-5.4","task":"sales_prediction","run_index":0,"reward":8.3858,"baseline_reward":5.5973,"reference_reward":12.0,"gain":2.7885,"normalized_reward":0.4355193902572352,"normalized_gain":0.4355193902572352,"cost_usd":3.431588,"latency_seconds":10.044379,"instance_count":12,"reward_curve":[0.5524,0.6027,0.7174,0.758,0.7636,0.7059,0.7206,0.7523,0.6933,0.706,0.7111,0.7025],"baseline_reward_curve":[0.5667,0.4004,0.4451,0.6295,0.5362,0.4262,0.5518,0.2929,0.4103,0.5599,0.4392,0.3391],"gain_curve":[-0.01429999999999998,0.20230000000000004,0.27230000000000004,0.12850000000000006,0.22739999999999994,0.27969999999999995,0.16880000000000006,0.4594,0.28300000000000003,0.1461,0.2719,0.3634],"cost_curve":[0.10503,0.1557635,0.268785,0.2497235,0.2734875,0.291811,0.294413,0.3211925,0.3576685,0.3474675,0.371026,0.39522]},{"run_name":"icl-gpt-5.4","task":"sales_prediction","run_index":1,"reward":9.7085,"baseline_reward":5.5973,"reference_reward":12.0,"gain":4.111200000000001,"normalized_reward":0.6421041123276119,"normalized_gain":0.6421041123276119,"cost_usd":3.5422345,"latency_seconds":13.60871,"instance_count":12,"reward_curve":[0.6124,0.5758,0.7612,0.7726,0.8099,0.8245,0.8338,0.9123,0.8504,0.8947,0.9293,0.9316],"baseline_reward_curve":[0.5667,0.4004,0.4451,0.6295,0.5362,0.4262,0.5518,0.2929,0.4103,0.5599,0.4392,0.3391],"gain_curve":[0.045700000000000074,0.1754,0.3161,0.1431,0.27369999999999994,0.3983,0.28200000000000003,0.6194,0.44010000000000005,0.3348000000000001,0.49010000000000004,0.5925],"cost_curve":[0.094688,0.1634915,0.2546495,0.216634,0.2580855,0.2972675,0.3363085,0.325191,0.34889,0.385028,0.417906,0.444095]},{"run_name":"icl-gpt-5.4","task":"sales_prediction","run_index":2,"reward":8.9053,"baseline_reward":5.5973,"reference_reward":12.0,"gain":3.3080000000000007,"normalized_reward":0.5166570353132274,"normalized_gain":0.5166570353132274,"cost_usd":3.7612045,"latency_seconds":11.630132,"instance_count":12,"reward_curve":[0.6143,0.6162,0.7416,0.8008,0.7542,0.7617,0.7662,0.8015,0.7434,0.76,0.7714,0.774],"baseline_reward_curve":[0.5667,0.4004,0.4451,0.6295,0.5362,0.4262,0.5518,0.2929,0.4103,0.5599,0.4392,0.3391],"gain_curve":[0.047599999999999976,0.2158,0.29650000000000004,0.1713,0.21799999999999997,0.3355,0.21440000000000003,0.5085999999999999,0.33309999999999995,0.20010000000000006,0.3322,0.4349],"cost_curve":[0.0986905,0.1648125,0.261979,0.2171965,0.259068,0.299575,0.33914,0.3533025,0.388458,0.425304,0.4580575,0.495621]},{"run_name":"icl-gpt-5.4","task":"sales_prediction","run_index":3,"reward":9.5659,"baseline_reward":5.5973,"reference_reward":12.0,"gain":3.9685999999999995,"normalized_reward":0.6198322582660439,"normalized_gain":0.6198322582660439,"cost_usd":3.4994805,"latency_seconds":10.907219,"instance_count":12,"reward_curve":[0.5012,0.6173,0.8059,0.7773,0.7984,0.8228,0.8486,0.8976,0.8224,0.8679,0.9024,0.9041],"baseline_reward_curve":[0.5667,0.4004,0.4451,0.6295,0.5362,0.4262,0.5518,0.2929,0.4103,0.5599,0.4392,0.3391],"gain_curve":[-0.0655,0.21689999999999998,0.36079999999999995,0.14780000000000004,0.2622,0.39659999999999995,0.29680000000000006,0.6047,0.4121,0.30800000000000005,0.4632,0.565],"cost_curve":[0.1396015,0.1615735,0.259149,0.2224605,0.2629155,0.2675605,0.294562,0.3230855,0.3503615,0.378334,0.408249,0.431628]},{"run_name":"icl-gpt-5.4","task":"sales_prediction","run_index":4,"reward":9.5846,"baseline_reward":5.5973,"reference_reward":12.0,"gain":3.9873000000000003,"normalized_reward":0.62275290111984,"normalized_gain":0.62275290111984,"cost_usd":3.3646275,"latency_seconds":10.530193,"instance_count":12,"reward_curve":[0.4872,0.5924,0.815,0.7882,0.7953,0.8266,0.8201,0.8991,0.8332,0.8864,0.9284,0.9127],"baseline_reward_curve":[0.5667,0.4004,0.4451,0.6295,0.5362,0.4262,0.5518,0.2929,0.4103,0.5599,0.4392,0.3391],"gain_curve":[-0.07949999999999996,0.19200000000000006,0.36989999999999995,0.15870000000000006,0.2591,0.4004,0.2683000000000001,0.6062000000000001,0.42290000000000005,0.3265,0.4892,0.5735999999999999],"cost_curve":[0.120018,0.1659185,0.1934145,0.2095735,0.2454395,0.2832605,0.295363,0.325181,0.3573295,0.365015,0.3893055,0.414809]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"blind_spectrum_monitoring","run_index":0,"reward":36.49629999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":16.736599999999992,"normalized_reward":0.23826221899514496,"normalized_gain":0.23827631715695963,"cost_usd":2.870301,"latency_seconds":18.7514,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3564,0.3454,0.3343,0.3333,0.3114,0.3565,0.3252,0.3955,0.353,0.3735,0.4803,0.4604,0.5166,0.5428,0.4756,0.4973,0.5138,0.5072,0.5043,0.4994,0.4903,0.4756,0.3986,0.3876,0.4191,0.4186,0.3863,0.4196,0.4136,0.348,0.3969,0.3904,0.4305,0.3878,0.381,0.3658,0.4049,0.4118,0.4644,0.4961,0.4857,0.4104,0.4353,0.5254,0.4573,0.3758,0.3809,0.4847,0.4231,0.4122,0.4165,0.5055,0.4125,0.3909,0.4242,0.4513,0.3919,0.3792,0.3742,0.4181,0.412,0.3893,0.3477,0.4866,0.4591,0.447,0.3679,0.4346,0.5335,0.3166,0.4461,0.3561,0.4384,0.348,0.3928,0.3356,0.317,0.2756,0.3009,0.4444,0.3264,0.3566,0.3968,0.2769,0.3125,0.4034,0.3947],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.13,0.12129999999999999,0.1215,0.10599999999999998,0.1164,0.1344,0.11259999999999998,0.15510000000000002,0.12449999999999997,0.1542,0.232,0.26839999999999997,0.31919999999999993,0.31889999999999996,0.24860000000000002,0.29080000000000006,0.2664,0.3054,0.3024,0.2864,0.28200000000000003,0.25120000000000003,0.1653,0.1782,0.20859999999999998,0.18740000000000004,0.17909999999999998,0.22139999999999999,0.20510000000000003,0.13849999999999998,0.19419999999999998,0.16690000000000002,0.2166,0.18489999999999998,0.1396,0.1685,0.1846,0.1854,0.2718,0.25639999999999996,0.2641,0.18309999999999998,0.20790000000000003,0.30389999999999995,0.2264,0.14250000000000002,0.15220000000000003,0.267,0.20159999999999997,0.20470000000000002,0.20379999999999998,0.2808999999999999,0.18729999999999997,0.19110000000000002,0.18810000000000002,0.25579999999999997,0.1763,0.13729999999999998,0.16279999999999997,0.20150000000000004,0.19099999999999998,0.19119999999999998,0.1322,0.25939999999999996,0.20390000000000003,0.2382,0.1467,0.1805,0.31999999999999995,0.06939999999999999,0.2158,0.13530000000000003,0.20070000000000002,0.10579999999999998,0.17989999999999998,0.08680000000000002,0.11730000000000002,0.06770000000000001,0.08330000000000001,0.22780000000000003,0.11630000000000001,0.13729999999999998,0.1964,0.07729999999999998,0.11080000000000001,0.15919999999999998,0.1725],"cost_curve":[0.007944,0.012609,0.017052,0.023709,0.026883,0.027387,0.033273,0.033255,0.034974,0.038379,0.040806,0.043374,0.04263,0.046365,0.050154,0.056073,0.029619,0.023949,0.023829,0.022875,0.021891,0.023049,0.021948,0.025329,0.027465,0.026163,0.027192,0.028578,0.030825,0.03162,0.032553,0.030603,0.033006,0.030237,0.029913,0.029703,0.028497,0.030285,0.030153,0.031956,0.031995,0.030714,0.031806,0.031932,0.035796,0.033009,0.032592,0.031524,0.03336,0.031587,0.03204,0.035019,0.031704,0.032373,0.034998,0.034683,0.034446,0.030918,0.031365,0.032907,0.035712,0.032523,0.03474,0.032694,0.031029,0.033591,0.031716,0.03183,0.033039,0.032205,0.032238,0.034152,0.033408,0.034356,0.033138,0.035127,0.035703,0.032814,0.033522,0.034248,0.033816,0.035079,0.034092,0.03354,0.038925,0.034734,0.037716,0.035094,0.035544,0.023103]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"blind_spectrum_monitoring","run_index":1,"reward":32.805,"baseline_reward":19.7597,"reference_reward":90.0,"gain":13.045300000000001,"normalized_reward":0.18570879425959932,"normalized_gain":0.18572386507460817,"cost_usd":2.489091,"latency_seconds":14.35801,"instance_count":90,"reward_curve":[0.2072,0.2245,0.2283,0.261,0.2387,0.2614,0.2094,0.2608,0.2539,0.3187,0.338,0.3237,0.2521,0.2607,0.282,0.3456,0.357,0.2893,0.2956,0.3917,0.3342,0.3206,0.3307,0.3438,0.3202,0.3532,0.3779,0.3614,0.3304,0.3683,0.42,0.3756,0.3467,0.3672,0.4363,0.5,0.5184,0.528,0.3071,0.3081,0.2836,0.4123,0.3576,0.3238,0.4034,0.3575,0.3167,0.3208,0.3543,0.3436,0.3154,0.2665,0.3056,0.3748,0.4256,0.3554,0.3618,0.4132,0.4589,0.4291,0.3554,0.4449,0.3754,0.428,0.4032,0.398,0.2988,0.3943,0.3868,0.3673,0.5011,0.3406,0.3816,0.4543,0.3905,0.446,0.3954,0.4451,0.4951,0.4728,0.4858,0.4167,0.3976,0.4232,0.5316,0.3578,0.472,0.4544,0.3899,0.3738],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.0237,0.016600000000000004,0.03460000000000002,0.014600000000000002,0.04860000000000003,-0.0179,0.06579999999999997,0.03180000000000002,0.10609999999999997,0.09760000000000002,0.09519999999999998,0.032799999999999996,0.012399999999999994,0.08999999999999997,0.14820000000000003,0.1331,0.062299999999999994,0.08909999999999998,0.14429999999999998,0.1324,0.1187,0.1177,0.13549999999999998,0.0958,0.1199,0.1685,0.1509,0.09920000000000004,0.16110000000000002,0.2218,0.1671,0.13720000000000002,0.16450000000000004,0.21280000000000002,0.2861,0.3155,0.2866,0.10979999999999998,0.08779999999999999,0.05720000000000003,0.2197,0.11789999999999998,0.10219999999999999,0.17609999999999998,0.1301,0.09519999999999998,0.08989999999999998,0.121,0.11490000000000003,0.09770000000000001,0.04500000000000001,0.09809999999999999,0.16210000000000002,0.20099999999999998,0.13019999999999998,0.162,0.1771,0.26339999999999997,0.21349999999999997,0.11349999999999999,0.2335,0.15880000000000002,0.207,0.2051,0.18250000000000002,0.0716,0.1391,0.17799999999999996,0.1461,0.247,0.12710000000000002,0.1344,0.22399999999999998,0.16970000000000002,0.2083,0.15319999999999998,0.2322,0.2463,0.2731,0.27790000000000004,0.19910000000000003,0.18100000000000002,0.2131,0.31229999999999997,0.1574,0.2724,0.25270000000000004,0.14570000000000002,0.1516],"cost_curve":[0.01122,0.016422,0.017982,0.017037,0.018477,0.017112,0.013713,0.015693,0.018036,0.020427,0.02175,0.020886,0.017151,0.015771,0.019812,0.022344,0.021381,0.01992,0.025905,0.024744,0.023148,0.026919,0.023364,0.022047,0.024675,0.022593,0.025122,0.022581,0.022962,0.023541,0.024429,0.026109,0.025017,0.026076,0.027141,0.027063,0.027063,0.027495,0.026247,0.027741,0.028296,0.027555,0.026796,0.02808,0.026697,0.026283,0.025842,0.026352,0.025758,0.028971,0.027447,0.026646,0.027348,0.030507,0.0321,0.028509,0.029628,0.030795,0.033339,0.030639,0.030546,0.031113,0.034353,0.032988,0.034272,0.031263,0.03402,0.033417,0.035238,0.037824,0.036405,0.035733,0.034092,0.038583,0.033894,0.036339,0.034968,0.038991,0.037188,0.035379,0.037578,0.036777,0.035625,0.036171,0.036057,0.040329,0.035067,0.034428,0.033864,0.021885]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"blind_spectrum_monitoring","run_index":2,"reward":28.93129999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":9.17159999999999,"normalized_reward":0.13055852161904336,"normalized_gain":0.13057461314943117,"cost_usd":2.389614,"latency_seconds":13.110182,"instance_count":90,"reward_curve":[0.2482,0.2811,0.3103,0.301,0.2953,0.3252,0.3954,0.383,0.3978,0.3906,0.462,0.4591,0.4344,0.4532,0.4372,0.4458,0.4582,0.4582,0.4582,0.4478,0.4478,0.3266,0.2876,0.298,0.1926,0.2521,0.2509,0.2464,0.2478,0.2584,0.2831,0.2568,0.253,0.245,0.2836,0.2724,0.2435,0.2549,0.243,0.2544,0.3197,0.3006,0.2785,0.2586,0.3151,0.3348,0.3352,0.3106,0.2794,0.2555,0.3089,0.3327,0.3367,0.3033,0.2653,0.2164,0.2295,0.2396,0.2687,0.3212,0.3086,0.3666,0.3229,0.413,0.3454,0.2885,0.2975,0.3067,0.3533,0.3858,0.3059,0.3072,0.3063,0.3181,0.3513,0.348,0.4298,0.454,0.42,0.4556,0.1987,0.3782,0.2649,0.2886,0.302,0.3952,0.2458,0.2412,0.2473,0.2347],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.03290000000000001,0.09860000000000002,0.0746,0.07120000000000001,0.1124,0.16809999999999997,0.188,0.1757,0.178,0.22160000000000002,0.2306,0.2151,0.2049,0.24519999999999997,0.24839999999999998,0.2343,0.2312,0.25170000000000003,0.20039999999999997,0.24599999999999997,0.1247,0.07460000000000003,0.08969999999999997,-0.031799999999999995,0.018799999999999983,0.04150000000000001,0.035900000000000015,0.016600000000000004,0.05120000000000002,0.08490000000000003,0.04829999999999998,0.04350000000000001,0.042300000000000004,0.060100000000000015,0.05849999999999997,0.0406,0.013500000000000012,0.04569999999999999,0.03410000000000002,0.0933,0.10799999999999998,0.03880000000000003,0.037000000000000005,0.08779999999999999,0.1074,0.1137,0.0797,0.046099999999999974,0.026800000000000018,0.0912,0.1112,0.1292,0.09060000000000001,0.040699999999999986,-0.008800000000000002,0.029700000000000004,0.003500000000000003,0.07319999999999999,0.10559999999999997,0.06669999999999998,0.15519999999999998,0.10630000000000003,0.19199999999999998,0.1473,0.07299999999999998,0.07029999999999997,0.05149999999999999,0.1445,0.16459999999999997,0.05180000000000001,0.09369999999999998,0.059100000000000014,0.08779999999999999,0.1305,0.11029999999999998,0.18760000000000002,0.2411,0.1712,0.2559,-0.009200000000000014,0.1606,0.04830000000000004,0.07850000000000001,0.0827,0.1948,0.04619999999999999,0.03950000000000001,0.0030999999999999917,0.012499999999999983],"cost_curve":[0.008778,0.015054,0.021564,0.023829,0.02748,0.024174,0.02676,0.030516,0.033762,0.031995,0.029772,0.027987,0.027423,0.027885,0.031791,0.039006,0.036015,0.03012,0.032094,0.033798,0.02667,0.021717,0.019215,0.019206,0.017502,0.021537,0.019203,0.021789,0.021525,0.023592,0.023748,0.023358,0.021102,0.02361,0.023085,0.024336,0.024918,0.02787,0.027072,0.031599,0.028131,0.028191,0.031152,0.032373,0.030798,0.029415,0.030225,0.032256,0.031872,0.032889,0.034917,0.031995,0.040809,0.032052,0.032106,0.023175,0.024519,0.021915,0.022257,0.022452,0.023049,0.026565,0.026922,0.027732,0.025875,0.027144,0.028224,0.028209,0.02787,0.027276,0.027786,0.02544,0.02706,0.028209,0.033921,0.028677,0.032844,0.032232,0.034053,0.02946,0.01938,0.027174,0.017169,0.019173,0.020688,0.022743,0.01782,0.01902,0.019773,0.02217]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"blind_spectrum_monitoring","run_index":3,"reward":38.77570000000001,"baseline_reward":19.7597,"reference_reward":90.0,"gain":19.01600000000001,"normalized_reward":0.2707142755449252,"normalized_gain":0.27072777308752966,"cost_usd":3.326742,"latency_seconds":21.974051,"instance_count":90,"reward_curve":[0.192,0.2393,0.2258,0.2357,0.2399,0.2331,0.2547,0.2763,0.2717,0.2961,0.2967,0.3462,0.3448,0.3748,0.331,0.3498,0.3759,0.3747,0.3974,0.4039,0.3572,0.3834,0.3816,0.3978,0.3506,0.374,0.4227,0.4759,0.4492,0.4661,0.4545,0.4281,0.5029,0.5252,0.4934,0.4349,0.4104,0.4717,0.4644,0.526,0.5201,0.5455,0.3988,0.4372,0.6009,0.4888,0.4995,0.4968,0.5297,0.5477,0.5942,0.4705,0.5412,0.539,0.5098,0.4149,0.5225,0.4826,0.526,0.4303,0.4254,0.4679,0.5686,0.4677,0.4643,0.5015,0.4975,0.4081,0.402,0.5503,0.5168,0.5892,0.4531,0.4048,0.3456,0.5656,0.3917,0.5184,0.4388,0.5083,0.4689,0.4895,0.4852,0.4658,0.4595,0.5337,0.4762,0.3033,0.3234,0.3588],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.008899999999999991,0.014100000000000001,0.009300000000000003,0.01580000000000001,0.020300000000000012,0.02739999999999998,0.08129999999999998,0.049600000000000005,0.08349999999999996,0.05630000000000002,0.1177,0.1255,0.12650000000000003,0.139,0.1524,0.15200000000000002,0.14769999999999997,0.1909,0.15649999999999997,0.1554,0.18150000000000002,0.1686,0.18949999999999997,0.12620000000000003,0.1407,0.21330000000000002,0.26539999999999997,0.218,0.2589,0.25630000000000003,0.2196,0.2934,0.3225,0.26990000000000003,0.221,0.2075,0.2303,0.2671,0.3057,0.2937,0.3529,0.1591,0.21559999999999999,0.3736,0.2614,0.278,0.2659,0.29639999999999994,0.31899999999999995,0.37649999999999995,0.24899999999999997,0.3337,0.32630000000000003,0.2852,0.18969999999999998,0.3227,0.24649999999999997,0.3305,0.2147,0.1835,0.25649999999999995,0.352,0.2467,0.2662,0.2859999999999999,0.2703,0.15290000000000004,0.1932,0.3291,0.26270000000000004,0.3756999999999999,0.2059,0.1745,0.12480000000000002,0.32789999999999997,0.1495,0.3055,0.19000000000000003,0.3086,0.261,0.27190000000000003,0.26860000000000006,0.2557,0.24020000000000002,0.33329999999999993,0.2766,0.10160000000000002,0.07920000000000002,0.1366],"cost_curve":[0.009696,0.014553,0.01833,0.019572,0.021114,0.018453,0.022569,0.023862,0.026043,0.028212,0.027582,0.030057,0.033177,0.033021,0.035655,0.036504,0.038025,0.042021,0.047688,0.044643,0.04347,0.045192,0.045957,0.04629,0.048264,0.051363,0.055773,0.056865,0.056649,0.058506,0.061899,0.064191,0.064578,0.071094,0.069948,0.052227,0.048903,0.051612,0.053436,0.053385,0.040533,0.041478,0.04167,0.040788,0.042507,0.04353,0.0444,0.04587,0.047127,0.047793,0.036282,0.029928,0.028608,0.030864,0.031704,0.02841,0.028311,0.029499,0.02796,0.028305,0.028893,0.028989,0.029772,0.029832,0.028656,0.030555,0.030423,0.033636,0.03243,0.033432,0.031362,0.031272,0.033315,0.031689,0.033792,0.032472,0.033051,0.03126,0.031614,0.031686,0.033057,0.032184,0.031752,0.032676,0.032628,0.035103,0.02547,0.0237,0.023193,0.022902]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"blind_spectrum_monitoring","run_index":4,"reward":42.95660000000001,"baseline_reward":19.7597,"reference_reward":90.0,"gain":23.19690000000001,"normalized_reward":0.33023818676234007,"normalized_gain":0.330250582642728,"cost_usd":3.851712,"latency_seconds":23.459228,"instance_count":90,"reward_curve":[0.2273,0.2518,0.3384,0.3297,0.2747,0.2781,0.3165,0.3251,0.3113,0.3356,0.3265,0.3379,0.4044,0.3682,0.3825,0.4892,0.4842,0.5123,0.4772,0.4858,0.4796,0.5362,0.5075,0.4881,0.5045,0.5088,0.5412,0.5062,0.5139,0.399,0.5062,0.5905,0.6291,0.6072,0.5815,0.6169,0.5698,0.5979,0.5716,0.567,0.5744,0.5741,0.4796,0.4482,0.5588,0.5735,0.484,0.5297,0.5096,0.4701,0.5224,0.5408,0.6326,0.5557,0.5149,0.5104,0.5504,0.5154,0.5392,0.532,0.5639,0.5604,0.5333,0.5257,0.5192,0.4436,0.4491,0.437,0.4628,0.547,0.4984,0.5459,0.3897,0.5075,0.4499,0.5336,0.5429,0.3723,0.3755,0.4164,0.5015,0.4348,0.5154,0.5877,0.4578,0.4019,0.4185,0.4169,0.4109,0.4419],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,0.00360000000000002,0.12669999999999998,0.1033,0.050600000000000006,0.06530000000000002,0.0892,0.1301,0.08920000000000003,0.123,0.08610000000000001,0.10939999999999997,0.1851,0.11990000000000003,0.1905,0.29180000000000006,0.26030000000000003,0.2853,0.27070000000000005,0.2384,0.27780000000000005,0.33430000000000004,0.2945,0.27979999999999994,0.28009999999999996,0.2755,0.3318,0.29569999999999996,0.28270000000000006,0.19180000000000003,0.308,0.382,0.4196,0.40449999999999997,0.358,0.403,0.3669,0.3565,0.37429999999999997,0.34669999999999995,0.34800000000000003,0.38150000000000006,0.23990000000000003,0.2266,0.33149999999999996,0.3461,0.26249999999999996,0.29879999999999995,0.27630000000000005,0.24140000000000003,0.30469999999999997,0.3192999999999999,0.42510000000000003,0.34299999999999997,0.2903,0.28519999999999995,0.3506,0.2793,0.3437,0.3164,0.32199999999999995,0.349,0.3167,0.30469999999999997,0.3211,0.2281,0.2219,0.18180000000000002,0.254,0.32580000000000003,0.24430000000000002,0.33240000000000003,0.1425,0.27719999999999995,0.22910000000000003,0.29589999999999994,0.3007000000000001,0.15940000000000001,0.1267,0.2167,0.29359999999999997,0.21720000000000003,0.29879999999999995,0.3776,0.2385,0.20149999999999998,0.21889999999999998,0.2152,0.1667,0.2197],"cost_curve":[0.009774,0.017328,0.026214,0.035142,0.039576,0.041628,0.04674,0.048993,0.053691,0.062268,0.062421,0.064788,0.066687,0.065493,0.071544,0.076305,0.079413,0.080244,0.082701,0.087267,0.08388,0.085983,0.087066,0.08856,0.041496,0.037023,0.037974,0.0405,0.047136,0.040545,0.030705,0.028668,0.033246,0.034557,0.034413,0.037347,0.037452,0.041613,0.042369,0.044643,0.046185,0.053574,0.034134,0.034896,0.038418,0.040302,0.041526,0.043083,0.044853,0.045552,0.04629,0.05067,0.045666,0.048423,0.05052,0.05256,0.031824,0.025641,0.02394,0.02517,0.02538,0.03072,0.027384,0.031413,0.031455,0.030579,0.033447,0.030051,0.032535,0.031884,0.027507,0.031326,0.033645,0.031056,0.032136,0.034107,0.030963,0.035472,0.03573,0.032955,0.031629,0.031185,0.031284,0.031242,0.031695,0.03171,0.030459,0.034953,0.03891,0.03228]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"codebase_adaptation","run_index":0,"reward":9.800000000000002,"baseline_reward":7.874999999999999,"reference_reward":19.0,"gain":1.9250000000000034,"normalized_reward":0.036649214659686194,"normalized_gain":0.17303370786516883,"cost_usd":3.80859945,"latency_seconds":5.139154,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.725,0.8,0.625,0.625,0.4,0.7,0.575,0.775,0.525,0.925,0.775,0.2,0.375,0.775,0.2,0.8],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.075,0.725,0.5,0.675,0.525,0.675,0.6,0.925,0.85,0.45,0.0,0.0,0.325,0.725],"gain_curve":[0.0,0.0,0.0,0.725,-0.02499999999999991,0.55,-0.09999999999999998,-0.09999999999999998,0.02499999999999991,0.04999999999999993,0.09999999999999998,-0.07499999999999996,0.0,-0.07499999999999996,-0.25,0.375,0.775,-0.125,0.07500000000000007],"cost_curve":[0.0815097,0.1781397,0.1208541,0.08477175,0.08190195,0.22896825,0.16854315,0.29233785,0.15038685,0.7799424,0.06052215,0.2585076,0.025716,0.14474325,0.36843645,0.32266005,0.06624465,0.3132786,0.081135]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"codebase_adaptation","run_index":1,"reward":8.450000000000001,"baseline_reward":7.874999999999999,"reference_reward":19.0,"gain":0.575000000000002,"normalized_reward":-0.10471204188481656,"normalized_gain":0.051685393258427144,"cost_usd":6.3363549,"latency_seconds":10.173983,"instance_count":19,"reward_curve":[0.8,0.725,0.55,0.0,0.575,0.0,0.35,0.825,0.0,0.0,0.825,0.0,0.725,0.075,0.825,0.825,0.55,0.8,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.075,0.725,0.5,0.675,0.525,0.675,0.6,0.925,0.85,0.45,0.0,0.0,0.325,0.725],"gain_curve":[0.8,0.725,0.55,0.0,-0.25,-0.075,-0.375,0.32499999999999996,-0.675,-0.525,0.1499999999999999,-0.6,-0.20000000000000007,-0.775,0.37499999999999994,0.825,0.55,0.47500000000000003,-0.725],"cost_curve":[0.12625125,0.1193529,0.20260485,0.39539205,0.11036835,0.32798385,0.45580875,0.0632865,0.1055772,2.26520805,0.0461145,0.4375329,0.41822805,0.6710667,0.0533019,0.0976182,0.30456165,0.09458025,0.041517]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"codebase_adaptation","run_index":2,"reward":7.275,"baseline_reward":7.874999999999999,"reference_reward":19.0,"gain":-0.5999999999999988,"normalized_reward":-0.22774869109947632,"normalized_gain":-0.05393258426966281,"cost_usd":3.933855,"latency_seconds":4.380685,"instance_count":19,"reward_curve":[0.775,0.0,0.0,0.0,0.575,0.0,0.525,0.0,0.825,0.0,0.275,0.425,0.875,0.25,0.75,0.0,0.75,0.625,0.625],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.075,0.725,0.5,0.675,0.525,0.675,0.6,0.925,0.85,0.45,0.0,0.0,0.325,0.725],"gain_curve":[0.775,0.0,0.0,0.0,-0.25,-0.075,-0.19999999999999996,-0.5,0.1499999999999999,-0.525,-0.4,-0.175,-0.050000000000000044,-0.6,0.3,0.0,0.75,0.3,-0.09999999999999998],"cost_curve":[0.06698475,0.5675709,0.08887515,0.0969744,0.16861515,0.06604125,0.2320689,0.1691532,0.0961017,0.5321811,0.55412025,0.24808065,0.0376089,0.3633087,0.13210455,0.04534485,0.08918565,0.26794725,0.1115877]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"codebase_adaptation","run_index":3,"reward":9.625,"baseline_reward":7.874999999999999,"reference_reward":19.0,"gain":1.7500000000000009,"normalized_reward":0.018324607329843003,"normalized_gain":0.15730337078651693,"cost_usd":3.56120265,"latency_seconds":4.393736,"instance_count":19,"reward_curve":[0.375,0.775,0.0,0.325,0.0,0.0,0.65,0.85,0.75,0.75,0.75,0.85,0.2,0.525,0.6,0.3,0.525,0.675,0.725],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.075,0.725,0.5,0.675,0.525,0.675,0.6,0.925,0.85,0.45,0.0,0.0,0.325,0.725],"gain_curve":[0.375,0.775,0.0,0.325,-0.825,-0.075,-0.07499999999999996,0.35,0.07499999999999996,0.22499999999999998,0.07499999999999996,0.25,-0.7250000000000001,-0.32499999999999996,0.14999999999999997,0.3,0.525,0.35000000000000003,0.0],"cost_curve":[0.2732124,0.09512595,0.15885435,0.3748392,0.08816505,0.0596619,0.09455685,0.1155723,0.0770778,0.12780405,0.10649925,0.0450402,0.4102968,0.3357903,0.32249145,0.3935817,0.32273775,0.0926589,0.06723645]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"codebase_adaptation","run_index":4,"reward":8.674999999999999,"baseline_reward":7.874999999999999,"reference_reward":19.0,"gain":0.7999999999999998,"normalized_reward":-0.08115183246073301,"normalized_gain":0.07191011235955054,"cost_usd":3.56872845,"latency_seconds":5.116408,"instance_count":19,"reward_curve":[0.0,0.65,0.0,0.625,0.375,0.85,0.0,0.75,0.0,0.8,0.825,0.9,0.05,0.475,0.0,0.725,0.6,0.3,0.75],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.075,0.725,0.5,0.675,0.525,0.675,0.6,0.925,0.85,0.45,0.0,0.0,0.325,0.725],"gain_curve":[0.0,0.65,0.0,0.625,-0.44999999999999996,0.775,-0.725,0.25,-0.675,0.275,0.1499999999999999,0.30000000000000004,-0.875,-0.375,-0.45,0.725,0.6,-0.025000000000000022,0.025000000000000022],"cost_curve":[0.0531951,0.18646005,0.1015335,0.19473915,0.5324745,0.0605964,0.08859495,0.1632075,0.14348715,0.06518745,0.07541085,0.04028985,0.61111125,0.32417595,0.21060705,0.08357775,0.2345814,0.3088749,0.0906237]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"cohort_studies","run_index":0,"reward":-0.14062699999999995,"baseline_reward":-1.5791640000000002,"reference_reward":3.24404,"gain":1.4385370000000002,"normalized_reward":-0.5046040523840428,"normalized_gain":0.2982534016807085,"cost_usd":11.27089695,"latency_seconds":25.005242,"instance_count":20,"reward_curve":[-0.068531,0.001507,-0.029386,-0.022393,-0.012956,0.005972,0.010176,-0.035135,-0.048244,-0.053257,-0.075927,-0.021505,0.030895,-0.048172,-0.077884,0.009458,-0.018272,0.03574,0.215347,0.06194],"baseline_reward_curve":[-0.061582,-0.033378,-0.064619,-0.273522,-0.126519,-0.041676,-0.013652,-0.042472,-0.028838,-0.070315,-0.064211,-0.024661,0.054052,-0.096784,0.044672,0.00579,-0.597213,-0.025139,-0.129357,0.01026],"gain_curve":[-0.006948999999999997,0.034885,0.035233,0.251129,0.113563,0.047647999999999996,0.023828,0.007337000000000003,-0.019406000000000003,0.017058000000000004,-0.01171599999999999,0.003155999999999999,-0.023157000000000004,0.048611999999999995,-0.122556,0.0036679999999999994,0.578941,0.060879,0.344704,0.051680000000000004],"cost_curve":[0.5283141,0.49954215,0.59165865,0.74446995,0.89200155,0.90517905,0.7142169,0.71041455,0.7670355,0.3105102,0.4153782,0.67538205,0.78563595,0.5718417,0.3659685,0.45874875,0.4272126,0.31083165,0.2943396,0.30221535]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"cohort_studies","run_index":1,"reward":-0.3908700000000001,"baseline_reward":-1.5791640000000002,"reference_reward":3.24404,"gain":1.1882940000000002,"normalized_reward":-0.6158459062741716,"normalized_gain":0.2463702551250165,"cost_usd":11.6509797,"latency_seconds":24.402169,"instance_count":20,"reward_curve":[-0.051888,-0.080659,-0.043381,-0.0187,0.020167,-0.015809,-0.0869,-0.021294,-0.023932,-0.055472,-0.039491,-0.009241,0.023885,0.008038,0.063264,-0.010723,-0.044918,-0.01266,-0.013649,0.022493],"baseline_reward_curve":[-0.061582,-0.033378,-0.064619,-0.273522,-0.126519,-0.041676,-0.013652,-0.042472,-0.028838,-0.070315,-0.064211,-0.024661,0.054052,-0.096784,0.044672,0.00579,-0.597213,-0.025139,-0.129357,0.01026],"gain_curve":[0.009694000000000001,-0.047281,0.021237999999999993,0.254822,0.14668599999999998,0.025866999999999998,-0.07324800000000001,0.021178000000000002,0.004906000000000001,0.014843000000000002,0.024720000000000006,0.01542,-0.030167000000000003,0.104822,0.018591999999999997,-0.016513,0.552295,0.012479000000000002,0.115708,0.012232999999999999],"cost_curve":[0.70156275,0.78578205,0.8165511,0.55553235,0.8365773,0.4225125,0.5220774,0.4775115,0.45389325,0.47991825,0.57692745,0.5679354,0.51654015,0.477963,0.4680228,0.6845124,0.65548275,0.3440631,0.626448,0.6811662]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"cohort_studies","run_index":2,"reward":-2.111521,"baseline_reward":-1.5791640000000002,"reference_reward":3.24404,"gain":-0.532357,"normalized_reward":-1.380736061594815,"normalized_gain":-0.11037414133841321,"cost_usd":11.09890905,"latency_seconds":23.089562,"instance_count":20,"reward_curve":[-0.023482,-0.087337,-0.040377,0.005644,-0.026997,-0.377818,-0.161564,-0.293872,-0.079121,-0.028299,-0.019471,-0.051384,-0.098815,-0.053642,0.014129,-0.083617,0.061649,-0.045205,-0.095033,-0.626909],"baseline_reward_curve":[-0.061582,-0.033378,-0.064619,-0.273522,-0.126519,-0.041676,-0.013652,-0.042472,-0.028838,-0.070315,-0.064211,-0.024661,0.054052,-0.096784,0.044672,0.00579,-0.597213,-0.025139,-0.129357,0.01026],"gain_curve":[0.038099999999999995,-0.053959,0.024241999999999993,0.27916599999999997,0.099522,-0.336142,-0.14791200000000002,-0.2514,-0.050282999999999994,0.042016,0.04474,-0.026723,-0.152867,0.04314199999999999,-0.030543000000000004,-0.089407,0.658862,-0.020066,0.03432399999999999,-0.6371690000000001],"cost_curve":[0.46130025,0.6643152,0.39204975,0.4510593,0.8784747,0.4460016,0.70216185,0.5402535,0.7822092,0.58461015,0.4422408,0.4453866,0.4230813,0.30059535,0.53062935,0.5914077,0.5714466,0.71486205,0.54626595,0.63055785]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"cohort_studies","run_index":3,"reward":-0.944445,"baseline_reward":-1.5791640000000002,"reference_reward":3.24404,"gain":0.6347190000000003,"normalized_reward":-0.8619295500413418,"normalized_gain":0.13159696334635654,"cost_usd":12.3896688,"latency_seconds":25.887679,"instance_count":20,"reward_curve":[-0.027179,-0.040447,-0.070548,-0.062486,-0.019813,0.011599,-0.02742,0.02072,-0.014728,-0.026815,-0.038708,-0.053941,-0.066811,0.042397,-0.03194,0.002204,-0.065919,-0.035868,-0.25083,-0.187912],"baseline_reward_curve":[-0.061582,-0.033378,-0.064619,-0.273522,-0.126519,-0.041676,-0.013652,-0.042472,-0.028838,-0.070315,-0.064211,-0.024661,0.054052,-0.096784,0.044672,0.00579,-0.597213,-0.025139,-0.129357,0.01026],"gain_curve":[0.034403,-0.007068999999999999,-0.005929000000000004,0.211036,0.106706,0.053274999999999996,-0.013768,0.063192,0.01411,0.043500000000000004,0.025503000000000005,-0.029280000000000004,-0.120863,0.139181,-0.07661200000000001,-0.003586,0.5312939999999999,-0.010728999999999995,-0.121473,-0.198172],"cost_curve":[0.79361925,0.66309855,0.67302,0.60208155,0.79767915,0.5234412,0.82397145,0.65918445,0.82278975,0.62188335,0.68110365,0.41386125,0.51935715,0.48579015,0.5024448,0.31786395,0.7440537,0.49762995,0.71177775,0.53501775]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"cohort_studies","run_index":4,"reward":-0.33196200000000003,"baseline_reward":-1.5791640000000002,"reference_reward":3.24404,"gain":1.2472020000000001,"normalized_reward":-0.5896592192181512,"normalized_gain":0.2585837132329464,"cost_usd":11.38810005,"latency_seconds":23.24014,"instance_count":20,"reward_curve":[-0.035681,-0.033693,-0.00337,-0.049187,-0.04587,-0.011002,-0.0058,-0.010269,-0.059777,-0.058895,-0.096055,-0.017663,-0.075892,-0.032239,0.004926,0.042899,0.115373,0.045635,0.004021,-0.009423],"baseline_reward_curve":[-0.061582,-0.033378,-0.064619,-0.273522,-0.126519,-0.041676,-0.013652,-0.042472,-0.028838,-0.070315,-0.064211,-0.024661,0.054052,-0.096784,0.044672,0.00579,-0.597213,-0.025139,-0.129357,0.01026],"gain_curve":[0.025901,-0.0003150000000000028,0.061249,0.22433499999999998,0.080649,0.030674,0.007852,0.032203,-0.030938999999999998,0.01142,-0.031844,0.006997999999999997,-0.129944,0.06454499999999999,-0.039746000000000004,0.037109,0.7125859999999999,0.070774,0.133378,-0.019683],"cost_curve":[0.76267335,0.68057415,0.3348906,0.63339165,0.6403389,0.64295025,0.6150891,0.868974,0.71940465,0.7655652,0.5141511,0.4175694,0.42909045,0.62268405,0.48983955,0.5208888,0.55197285,0.5488977,0.1988964,0.4302579]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"database_exploration","run_index":0,"reward":12.93333333333333,"baseline_reward":7.333333333333332,"reference_reward":40.0,"gain":5.599999999999998,"normalized_reward":0.2147001934235976,"normalized_gain":0.17142857142857135,"cost_usd":2.11322535,"latency_seconds":2.369584,"instance_count":40,"reward_curve":[0.0,0.0,0.2666666666666667,0.4666666666666667,0.6666666666666667,0.7333333333333334,0.6666666666666667,0.0,0.7333333333333334,0.6,0.4666666666666667,0.8,0.5333333333333333,0.5333333333333333,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.8,0.6,0.6,0.0,0.0,0.4,0.1333333333333333,0.0,0.19999999999999996,0.6,0.6,0.6,0.6666666666666667],"baseline_reward_curve":[0.0,0.0,0.4666666666666667,0.6,0.0,0.6,0.4666666666666667,0.0,0.6666666666666667,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.0,0.06666666666666665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.6666666666666667,0.33333333333333337,0.6,0.0],"gain_curve":[0.0,0.0,-0.19999999999999996,-0.1333333333333333,0.6666666666666667,0.13333333333333341,0.20000000000000007,0.0,0.06666666666666665,0.6,-0.06666666666666665,0.1333333333333333,0.2666666666666666,0.19999999999999996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.33333333333333337,0.0,0.6666666666666667,0.0,0.7333333333333334,0.6,0.6,0.0,0.0,0.4,0.1333333333333333,0.0,0.06666666666666665,-0.06666666666666676,0.2666666666666666,0.0,0.6666666666666667],"cost_curve":[0.1009131,0.04230135,0.08063895,0.06501015,0.03912465,0.0368445,0.03977505,0.09890805,0.02808,0.04762935,0.05838315,0.022209,0.0617262,0.0530244,0.1207803,0.0590883,0.0265374,0.020031,0.0462891,0.06487845,0.047346,0.053895,0.04418085,0.0716553,0.018642,0.0361911,0.0341235,0.027291,0.0420675,0.05354955,0.014688,0.012531,0.08025195,0.0882375,0.0648294,0.1095216,0.0643731,0.04167375,0.05455365,0.04145115]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"database_exploration","run_index":1,"reward":10.399999999999999,"baseline_reward":7.333333333333332,"reference_reward":40.0,"gain":3.0666666666666664,"normalized_reward":0.14119922630560927,"normalized_gain":0.09387755102040814,"cost_usd":2.25657045,"latency_seconds":2.364311,"instance_count":40,"reward_curve":[0.5333333333333333,0.8,0.6666666666666667,0.6,0.0,0.2666666666666667,0.0,0.6666666666666667,0.0,0.7333333333333334,0.0,0.0,0.5333333333333333,0.0,0.4666666666666667,0.8,0.6,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2666666666666667,0.0,0.5333333333333333,0.4666666666666667,0.0,0.0,0.0,0.0,0.7333333333333334,0.6,0.4666666666666667],"baseline_reward_curve":[0.0,0.0,0.4666666666666667,0.6,0.0,0.6,0.4666666666666667,0.0,0.6666666666666667,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.0,0.06666666666666665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.6666666666666667,0.33333333333333337,0.6,0.0],"gain_curve":[0.5333333333333333,0.8,0.20000000000000007,0.0,0.0,-0.33333333333333326,-0.4666666666666667,0.6666666666666667,-0.6666666666666667,0.7333333333333334,-0.5333333333333333,-0.6666666666666667,0.2666666666666666,-0.33333333333333337,0.4666666666666667,0.8,0.6,0.6666666666666667,-0.6,0.0,0.0,0.0,0.0,-0.33333333333333337,0.0,0.0,0.0,-0.06666666666666665,0.0,0.2666666666666667,0.0,0.5333333333333333,0.4666666666666667,0.0,0.0,-0.1333333333333333,-0.6666666666666667,0.4,0.0,0.4666666666666667],"cost_curve":[0.05759565,0.021693,0.0475995,0.05399325,0.09885615,0.0823167,0.1312746,0.03920175,0.0515106,0.031887,0.023691,0.020526,0.05367795,0.056517,0.05450205,0.020847,0.0519114,0.04242975,0.0666909,0.04023225,0.1013754,0.0431154,0.06263115,0.06501735,0.0540027,0.06599655,0.0755892,0.04992105,0.05234655,0.09397845,0.06262125,0.0495072,0.06276765,0.1104639,0.009993,0.06034725,0.05671095,0.025776,0.05010405,0.0573519]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"database_exploration","run_index":2,"reward":13.0,"baseline_reward":7.333333333333332,"reference_reward":40.0,"gain":5.666666666666668,"normalized_reward":0.21663442940038685,"normalized_gain":0.17346938775510204,"cost_usd":2.15163105,"latency_seconds":2.468706,"instance_count":40,"reward_curve":[0.0,0.0,0.33333333333333337,0.6,0.5333333333333333,0.5333333333333333,0.0,0.6666666666666667,0.0,0.0,0.8666666666666667,0.33333333333333337,0.0,0.0,0.7333333333333334,0.5333333333333333,0.7333333333333334,0.7333333333333334,0.6666666666666667,0.5333333333333333,0.6666666666666667,0.0,0.4,0.8,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.5333333333333333,0.6666666666666667,0.0,0.06666666666666665,0.6666666666666667,0.0,0.0,0.0,0.7333333333333334],"baseline_reward_curve":[0.0,0.0,0.4666666666666667,0.6,0.0,0.6,0.4666666666666667,0.0,0.6666666666666667,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.0,0.06666666666666665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.6666666666666667,0.33333333333333337,0.6,0.0],"gain_curve":[0.0,0.0,-0.1333333333333333,0.0,0.5333333333333333,-0.06666666666666665,-0.4666666666666667,0.6666666666666667,-0.6666666666666667,0.0,0.33333333333333337,-0.33333333333333337,-0.2666666666666667,-0.33333333333333337,0.7333333333333334,0.5333333333333333,0.7333333333333334,0.7333333333333334,0.06666666666666676,0.5333333333333333,0.6666666666666667,0.0,0.4,0.4666666666666667,0.0,0.0,0.0,-0.06666666666666665,0.0,0.6666666666666667,0.0,0.5333333333333333,0.6666666666666667,0.0,0.06666666666666665,0.5333333333333334,-0.6666666666666667,-0.33333333333333337,-0.6,0.7333333333333334],"cost_curve":[0.08880195,0.1244376,0.0853446,0.0457794,0.05628375,0.05104875,0.038856,0.0369549,0.023454,0.0383745,0.015087,0.06695805,0.0667218,0.049884,0.036288,0.0480087,0.04026525,0.04140225,0.04930635,0.06112305,0.048819,0.05384385,0.06546165,0.042192,0.03093825,0.009345,0.1291356,0.07244865,0.04624875,0.044961,0.1001367,0.0522351,0.04063275,0.020613,0.1479945,0.05092635,0.010647,0.0594105,0.0365205,0.024741]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"database_exploration","run_index":3,"reward":10.266666666666667,"baseline_reward":7.333333333333332,"reference_reward":40.0,"gain":2.9333333333333353,"normalized_reward":0.137330754352031,"normalized_gain":0.08979591836734699,"cost_usd":2.33806935,"latency_seconds":2.369236,"instance_count":40,"reward_curve":[0.0,0.5333333333333333,0.0,0.0,0.0,0.6666666666666667,0.7333333333333334,0.6666666666666667,0.0,0.6,0.4,0.7333333333333334,0.7333333333333334,0.0,0.0,0.5333333333333333,0.0,0.6,0.0,0.0,0.0,0.6,0.6666666666666667,0.0,0.7333333333333334,0.4666666666666667,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6666666666666667,0.2666666666666667,0.2666666666666667],"baseline_reward_curve":[0.0,0.0,0.4666666666666667,0.6,0.0,0.6,0.4666666666666667,0.0,0.6666666666666667,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.0,0.06666666666666665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.6666666666666667,0.33333333333333337,0.6,0.0],"gain_curve":[0.0,0.5333333333333333,-0.4666666666666667,-0.6,0.0,0.06666666666666676,0.2666666666666667,0.6666666666666667,-0.6666666666666667,0.6,-0.1333333333333333,0.06666666666666665,0.4666666666666667,-0.33333333333333337,0.0,0.5333333333333333,0.0,0.6,-0.6,0.0,0.0,0.6,0.6666666666666667,-0.33333333333333337,0.7333333333333334,0.4666666666666667,0.0,-0.06666666666666665,0.4,0.0,0.0,0.0,0.0,0.0,0.0,-0.1333333333333333,-0.6666666666666667,0.33333333333333337,-0.33333333333333326,0.2666666666666667],"cost_curve":[0.09132915,0.06086055,0.091281,0.031524,0.03049875,0.0403635,0.03738,0.04068375,0.0275769,0.0430362,0.0616398,0.024861,0.0344385,0.06556875,0.0569952,0.05853225,0.06687555,0.05146695,0.0622476,0.05325885,0.04332015,0.04520025,0.0396792,0.034098,0.037242,0.0792771,0.0572829,0.08751645,0.0619836,0.0672978,0.10074585,0.0389472,0.06056985,0.0998559,0.05683995,0.04847685,0.1177641,0.0418254,0.10576455,0.083964]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"database_exploration","run_index":4,"reward":8.4,"baseline_reward":7.333333333333332,"reference_reward":40.0,"gain":1.0666666666666682,"normalized_reward":0.08317214700193427,"normalized_gain":0.03265306122448984,"cost_usd":2.34063795,"latency_seconds":2.274254,"instance_count":40,"reward_curve":[0.6666666666666667,0.4,0.0,0.0,0.0,0.8,0.0,0.4,0.0,0.7333333333333334,0.5333333333333333,0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.06666666666666665,0.2666666666666667,0.0,0.0,0.5333333333333333,0.0,0.6,0.0,0.0,0.6666666666666667,0.4666666666666667,0.0,0.7333333333333334,0.0,0.4,0.2666666666666667,0.0,0.0,0.0,0.19999999999999996,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.4666666666666667,0.6,0.0,0.6,0.4666666666666667,0.0,0.6666666666666667,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.33333333333333337,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.0,0.06666666666666665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1333333333333333,0.6666666666666667,0.33333333333333337,0.6,0.0],"gain_curve":[0.6666666666666667,0.4,-0.4666666666666667,-0.6,0.0,0.20000000000000007,-0.4666666666666667,0.4,-0.6666666666666667,0.7333333333333334,0.0,-0.6666666666666667,-0.2666666666666667,0.33333333333333337,0.0,0.0,0.0,0.0,-0.5333333333333333,0.2666666666666667,0.0,0.0,0.5333333333333333,-0.33333333333333337,0.6,0.0,0.0,0.6000000000000001,0.4666666666666667,0.0,0.7333333333333334,0.0,0.4,0.2666666666666667,0.0,-0.1333333333333333,-0.6666666666666667,-0.13333333333333341,-0.6,0.0],"cost_curve":[0.0340785,0.06689535,0.0467628,0.04636965,0.029502,0.018387,0.1181415,0.08024475,0.0839829,0.03388575,0.0647994,0.03310515,0.0731073,0.03890475,0.04519035,0.0483348,0.04153695,0.0408123,0.11577615,0.0678018,0.04496985,0.0542004,0.0503865,0.0815043,0.04795275,0.05530305,0.07786185,0.03905625,0.0580149,0.09626835,0.026922,0.0549198,0.08400825,0.0955896,0.024393,0.0452568,0.07249335,0.1211826,0.033183,0.0495522]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"exploitable_poker","run_index":0,"reward":166.8,"baseline_reward":317.0,"reference_reward":1138.5,"gain":-150.2,"normalized_reward":0.03303811324509903,"normalized_gain":-0.18283627510651246,"cost_usd":6.5613117,"latency_seconds":16.281688,"instance_count":120,"reward_curve":[-2.2,-0.5,-2.4,2.8,-0.5,2.4,2.8,-2.8,13.5,-2.8,5.6,-22.3,2.8,6.3,-2.8,-1.0,21.3,-1.0,6.0,26.3,-1.0,-9.0,-1.0,1.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,-1.0,-3.0,0.5,0.5,-1.0,-1.0,1.0,8.0,-1.0,-1.0,-0.5,-0.5,-1.0,-1.0,2.0,-1.0,-0.5,-1.0,5.5,2.5,-5.0,-2.5,4.5,7.0,2.0,-3.5,-2.5,-1.0,1.0,0.5,0.5,-4.2,-1.0,1.0,-1.0,2.5,1.0,-2.2,1.0,9.5,0.5,-1.0,-1.0,-2.5,-0.5,0.5,-1.0,4.2,-1.0,0.5,2.5,-1.0,3.0,-1.0,1.0,6.5,1.0,-1.0,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-1.0,-1.0,1.0,0.5,-1.0,-1.0,1.0,-1.0,2.0,0.5,-11.0,2.0,-1.0,2.0,1.0,2.0,3.0,-1.0,2.0,100.0,3.0,0.5,-4.0],"baseline_reward_curve":[-2.4,2.0,-2.4,12.5,-5.0,2.2,3.0,-1.0,34.0,-14.0,5.2,-3.7,4.5,21.0,-8.0,-5.9,4.6,-5.6,5.6,31.0,2.0,-5.0,-1.0,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,1.0,-6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-1.0,-0.5,-1.0,10.5,2.4,-7.0,-8.5,4.6,6.0,5.2,-4.4,-6.0,-2.5,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-4.6,4.5,17.0,0.5,-2.5,-2.5,3.5,-0.5,0.5,-6.0,1.0,1.0,0.5,16.0,3.0,3.0,-5.4,1.0,3.5,1.0,1.0,0.5,1.0,-0.5,-2.4,0.5,0.5,1.0,-1.0,4.0,3.0,0.5,2.0,-1.0,1.0,-1.0,2.0,0.5,-17.5,2.0,-1.0,2.0,1.0,2.0,3.0,5.0,2.0,100.0,3.0,0.5,-4.0],"gain_curve":[0.19999999999999973,-2.5,0.0,-9.7,4.5,0.19999999999999973,-0.20000000000000018,-1.7999999999999998,-20.5,11.2,0.39999999999999947,-18.6,-1.7000000000000002,-14.7,5.2,4.9,16.700000000000003,4.6,0.40000000000000036,-4.699999999999999,-3.0,-4.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,-2.0,3.0,0.0,0.0,1.0,-1.0,0.0,-86.0,0.0,1.0,0.0,-1.5,0.0,1.0,-8.0,0.0,0.0,0.0,-5.0,0.10000000000000009,2.0,6.0,-0.09999999999999964,1.0,-3.2,0.9000000000000004,3.5,1.5,0.0,0.0,0.0,-2.2,-2.0,0.0,-2.0,1.5,0.0,2.3999999999999995,-3.5,-7.5,0.0,1.5,1.5,-6.0,0.0,0.0,5.0,3.2,-2.0,0.0,-13.5,-4.0,0.0,4.4,0.0,3.0,0.0,-2.0,0.0,-2.0,0.0,1.4,0.0,0.0,0.0,0.0,-5.0,-2.0,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,6.5,0.0,0.0,0.0,0.0,0.0,0.0,-6.0,0.0,0.0,0.0,0.0,0.0],"cost_curve":[0.032454,0.010437,0.06141675,0.05841375,0.013314,0.065988,0.0849276,0.0579192,0.10916745,0.091365,0.09351855,0.1409412,0.0830694,0.07268115,0.06973875,0.08604195,0.06606915,0.08465625,0.0907359,0.0781584,0.016689,0.1183965,0.039552,0.032349,0.045399,0.009285,0.0,0.008871,0.0,0.010611,0.019809,0.0,0.03561,0.06607275,0.0,0.0,0.040182,0.041991,0.028857,0.117354,0.05537235,0.012675,0.014712,0.012147,0.060483,0.0589515,0.05156985,0.0317445,0.010131,0.06325425,0.0925476,0.0710412,0.0862689,0.0919947,0.108246,0.1117923,0.1065018,0.08088375,0.13464075,0.07165425,0.11475075,0.0,0.0,0.11423835,0.06753165,0.011193,0.05758935,0.06360225,0.031485,0.08862825,0.0751488,0.08860485,0.0,0.08761875,0.0874701,0.0827763,0.017922,0.0,0.07815525,0.0946977,0.08675955,0.0,0.09588435,0.09144165,0.03364875,0.0917826,0.015321,0.04803855,0.1038843,0.1054761,0.0,0.09677445,0.018939,0.0934923,0.0,0.0,0.12725115,0.04115775,0.04599525,0.034518,0.0,0.019494,0.018306,0.033993,0.033543,0.046815,0.0,0.16577115,0.0504477,0.03231,0.02298,0.023511,0.03912075,0.08012085,0.021279,0.03482175,0.07297425,0.020529,0.0,0.14486475]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"exploitable_poker","run_index":1,"reward":149.7,"baseline_reward":317.0,"reference_reward":1138.5,"gain":-167.3,"normalized_reward":0.016021494676087166,"normalized_gain":-0.20365185636031652,"cost_usd":6.61982865,"latency_seconds":15.90779,"instance_count":120,"reward_curve":[13.0,14.0,1.0,-1.0,3.5,3.5,-1.0,-3.5,6.0,6.0,-3.5,14.0,1.0,-1.0,5.5,-13.5,-3.5,-1.0,-3.0,-0.5,-2.0,-0.5,0.0,1.0,-0.5,0.5,10.0,-2.0,2.0,0.5,2.0,-1.0,0.5,2.0,-1.0,0.5,-1.0,-1.0,1.0,0.5,-2.0,-0.5,-2.0,1.0,-1.0,8.0,-3.0,-5.0,-0.5,-1.0,-6.0,7.0,-6.5,-3.0,-3.0,5.5,1.0,3.0,4.5,-1.0,3.0,1.0,-5.0,11.5,-1.0,-0.5,0.5,0.5,-2.4,3.5,0.5,-0.5,-0.5,5.9,1.0,-2.4,1.0,1.0,-2.4,-2.4,0.5,0.5,4.9,0.5,-0.5,-1.0,8.0,-1.0,-0.5,1.0,0.5,-1.0,1.0,1.0,-0.5,-2.0,0.5,-1.0,-1.0,-1.0,-25.0,-1.0,-1.0,0.5,-1.0,2.0,-1.0,-1.0,0.5,3.0,-2.0,2.0,100.0,-1.0,4.0,0.5,3.0,1.0,3.0,-1.0],"baseline_reward_curve":[-2.4,2.0,-2.4,12.5,-5.0,2.2,3.0,-1.0,34.0,-14.0,5.2,-3.7,4.5,21.0,-8.0,-5.9,4.6,-5.6,5.6,31.0,2.0,-5.0,-1.0,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,1.0,-6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-1.0,-0.5,-1.0,10.5,2.4,-7.0,-8.5,4.6,6.0,5.2,-4.4,-6.0,-2.5,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-4.6,4.5,17.0,0.5,-2.5,-2.5,3.5,-0.5,0.5,-6.0,1.0,1.0,0.5,16.0,3.0,3.0,-5.4,1.0,3.5,1.0,1.0,0.5,1.0,-0.5,-2.4,0.5,0.5,1.0,-1.0,4.0,3.0,0.5,2.0,-1.0,1.0,-1.0,2.0,0.5,-17.5,2.0,-1.0,2.0,1.0,2.0,3.0,5.0,2.0,100.0,3.0,0.5,-4.0],"gain_curve":[15.4,12.0,3.4,-13.5,8.5,1.2999999999999998,-4.0,-2.5,-28.0,20.0,-8.7,17.7,-3.5,-22.0,13.5,-7.6,-8.1,4.6,-8.6,-31.5,-4.0,4.5,1.0,0.0,-1.5,1.0,9.5,-1.0,1.5,1.5,0.0,-1.5,-0.5,8.0,-1.5,0.0,1.0,-1.0,0.0,-93.5,-1.0,1.5,-1.5,0.0,0.0,10.0,-13.0,-4.0,0.0,0.0,-16.5,4.6,0.5,5.5,-7.6,-0.5,-4.2,7.4,10.5,1.5,2.0,0.5,-5.5,13.5,-2.0,-1.5,-0.5,-0.5,-3.4,8.1,-4.0,-17.5,-1.0,8.4,3.5,-5.9,1.5,0.5,3.6,-3.4,-0.5,0.0,-11.1,-2.5,-3.5,4.4,7.0,-4.5,-1.5,0.0,0.0,-2.0,1.5,3.4,-1.0,-2.5,-0.5,0.0,-5.0,-4.0,-25.5,-3.0,0.0,-0.5,0.0,0.0,-1.5,16.5,-1.5,4.0,-4.0,1.0,98.0,-4.0,-1.0,-1.5,-97.0,-2.0,2.5,3.0],"cost_curve":[0.05565375,0.06236325,0.06112275,0.045033,0.06252795,0.0715455,0.0613935,0.057744,0.05986755,0.07274325,0.06605415,0.0801429,0.05996925,0.05717925,0.0601842,0.07312725,0.06972975,0.0627543,0.0774978,0.014682,0.04068345,0.015801,0.10267185,0.05316525,0.006492,0.0,0.09163455,0.06147975,0.06390435,0.0,0.020727,0.021795,0.0,0.06615375,0.041649,0.0,0.0757323,0.07768725,0.053697,0.0,0.05856225,0.011724,0.06197325,0.029727,0.06273225,0.12839805,0.10560915,0.16580625,0.010191,0.043695,0.06324495,0.1005264,0.09278505,0.06947685,0.06918195,0.08224725,0.0929553,0.06327825,0.10762305,0.0599601,0.05922765,0.032964,0.0913221,0.1000329,0.08636295,0.025791,0.0,0.0,0.1098612,0.019524,0.0,0.011955,0.012168,0.0859761,0.11701635,0.0817356,0.042327,0.05231715,0.09535185,0.14171685,0.0,0.0,0.1086198,0.0,0.015867,0.10089465,0.16992015,0.11005155,0.015507,0.040272,0.0,0.1160208,0.123876,0.049863,0.01782,0.0772458,0.0,0.0396,0.018426,0.05052,0.1744707,0.06070425,0.028953,0.0,0.008067,0.043185,0.042651,0.052521,0.0,0.021384,0.0817215,0.0536037,0.08260935,0.022065,0.0674775,0.0,0.025434,0.04137375,0.06116925,0.027999]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"exploitable_poker","run_index":2,"reward":20.500000000000004,"baseline_reward":317.0,"reference_reward":1138.5,"gain":-296.5,"normalized_reward":-0.11254851228978008,"normalized_gain":-0.36092513694461353,"cost_usd":6.0201585,"latency_seconds":15.87956,"instance_count":120,"reward_curve":[36.0,-3.0,-6.0,5.5,-1.0,2.5,-3.0,-3.0,-11.0,1.0,12.5,-1.0,-1.0,5.5,3.0,1.0,3.0,-1.0,6.0,4.5,0.0,-0.5,-1.0,-2.0,-0.5,0.5,1.0,-1.0,0.5,0.5,-1.0,-0.5,-0.5,-8.0,-1.0,0.5,-1.0,-1.0,-1.0,-3.0,-0.5,10.0,-1.0,-6.0,-0.5,0.5,0.0,-2.0,-0.5,-1.0,2.0,2.5,-6.5,-3.5,-1.0,1.0,5.0,-1.0,4.5,-3.0,0.5,2.5,0.5,-0.5,0.5,1.0,1.0,-11.4,-5.9,2.4,-2.4,5.9,-1.0,-2.4,-0.5,3.0,-1.0,1.0,-1.0,-0.5,1.0,1.0,0.5,1.0,9.0,0.5,0.5,-1.0,-1.0,-1.0,-0.5,-5.2,-0.5,0.5,-1.0,-1.0,1.0,-1.0,3.0,-1.0,0.5,-2.0,-15.0,-1.0,-1.0,-0.5,-1.0,0.5,-1.0,16.0,-1.0,1.0,-0.5,-1.0,-0.5,0.5,-1.0,0.5,1.0,-1.0],"baseline_reward_curve":[-2.4,2.0,-2.4,12.5,-5.0,2.2,3.0,-1.0,34.0,-14.0,5.2,-3.7,4.5,21.0,-8.0,-5.9,4.6,-5.6,5.6,31.0,2.0,-5.0,-1.0,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,1.0,-6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-1.0,-0.5,-1.0,10.5,2.4,-7.0,-8.5,4.6,6.0,5.2,-4.4,-6.0,-2.5,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-4.6,4.5,17.0,0.5,-2.5,-2.5,3.5,-0.5,0.5,-6.0,1.0,1.0,0.5,16.0,3.0,3.0,-5.4,1.0,3.5,1.0,1.0,0.5,1.0,-0.5,-2.4,0.5,0.5,1.0,-1.0,4.0,3.0,0.5,2.0,-1.0,1.0,-1.0,2.0,0.5,-17.5,2.0,-1.0,2.0,1.0,2.0,3.0,5.0,2.0,100.0,3.0,0.5,-4.0],"gain_curve":[38.4,-5.0,-3.6,-7.0,4.0,0.2999999999999998,-6.0,-2.0,-45.0,15.0,7.3,2.7,-5.5,-15.5,11.0,6.9,-1.5999999999999996,4.6,0.40000000000000036,-26.5,-2.0,4.5,0.0,-3.0,-1.5,1.0,0.5,0.0,0.0,1.5,-3.0,-1.0,-1.5,-2.0,-1.5,0.0,1.0,-1.0,-2.0,-97.0,0.5,12.0,-0.5,-7.0,0.5,2.5,-10.0,-1.0,0.0,0.0,-8.5,0.10000000000000009,0.5,5.0,-5.6,-5.0,-0.20000000000000018,3.4000000000000004,10.5,-0.5,-0.5,2.0,0.0,1.5,-0.5,0.0,0.0,-12.4,-6.9,7.0,-6.9,-11.1,-1.5,0.10000000000000009,2.0,-0.5,-0.5,0.5,5.0,-1.5,0.0,0.5,-15.5,-2.0,6.0,5.9,-0.5,-4.5,-2.0,-2.0,-1.0,-6.2,0.0,2.9,-1.5,-1.5,0.0,0.0,-1.0,-4.0,0.0,-4.0,-14.0,-2.0,0.0,-2.5,-1.5,18.0,-3.0,17.0,-3.0,0.0,-2.5,-4.0,-5.5,-1.5,-101.0,-2.5,0.5,3.0],"cost_curve":[0.05369925,0.05418825,0.0582585,0.0863055,0.0565584,0.06158175,0.05887725,0.07013985,0.09640395,0.105222,0.077946,0.06356985,0.06385905,0.0962817,0.07802865,0.0369975,0.07888665,0.0812532,0.0798843,0.0856521,0.08665185,0.007908,0.034821,0.0625905,0.009504,0.0,0.021963,0.03765,0.0,0.0,0.012876,0.011106,0.010635,0.14446845,0.04194,0.0,0.020175,0.019209,0.034215,0.1215336,0.01374,0.1501212,0.02781525,0.09087825,0.011841,0.0,0.0888444,0.07569345,0.014334,0.07477875,0.1185624,0.06066975,0.07885845,0.0819615,0.06765735,0.0939348,0.1213128,0.0998451,0.11235255,0.1133913,0.0,0.05797515,0.0,0.013224,0.0,0.032157,0.028539,0.0816531,0.0848589,0.07208865,0.06893475,0.09660165,0.0795798,0.0557235,0.019905,0.027495,0.046638,0.020046,0.06094035,0.009357,0.024336,0.028833,0.0,0.007908,0.07652475,0.0,0.0,0.06787935,0.0877503,0.09766185,0.010197,0.093663,0.014232,0.0,0.0672258,0.03525675,0.0523035,0.018468,0.04740675,0.018603,0.0,0.05707875,0.18232425,0.058818,0.027669,0.012681,0.012684,0.0,0.0447,0.1111797,0.045318,0.063684,0.012849,0.027711,0.011748,0.0,0.009522,0.0,0.05757975,0.06321375]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"exploitable_poker","run_index":3,"reward":170.3,"baseline_reward":317.0,"reference_reward":1138.5,"gain":-146.7,"normalized_reward":0.036521046870335375,"normalized_gain":-0.17857577601947655,"cost_usd":6.64316175,"latency_seconds":16.683489,"instance_count":120,"reward_curve":[5.2,6.0,5.3,11.4,-5.2,-13.5,3.0,-3.0,-5.4,2.5,-3.0,1.0,-1.0,1.0,-1.0,-1.0,13.1,-2.4,6.5,13.6,-0.5,-0.5,-1.0,1.0,2.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,0.5,0.5,1.0,0.5,-1.0,-1.0,-2.0,-1.0,2.0,10.0,-0.5,-1.0,-0.5,0.5,-1.0,1.0,-1.0,0.5,1.0,-6.5,-3.0,5.5,4.7,2.2,2.2,-6.5,-2.2,4.5,-1.0,1.0,3.0,0.5,13.5,-1.0,3.0,-3.7,-0.5,-0.5,0.5,-1.0,5.0,0.5,-1.0,1.0,-0.5,-0.5,-0.5,0.5,-0.5,3.0,0.5,0.5,-1.0,-1.0,-0.5,1.0,1.0,0.5,-1.0,-0.5,-0.5,6.5,-1.0,-1.0,-25.0,100.0,-2.0,0.5,0.5,2.0,4.0,4.0,0.5,1.0,5.0,3.0,-1.0,3.0,-1.0,0.5,2.0,5.5,2.0,1.0,-1.0,-2.0,2.0,-4.0,2.0],"baseline_reward_curve":[-2.4,2.0,-2.4,12.5,-5.0,2.2,3.0,-1.0,34.0,-14.0,5.2,-3.7,4.5,21.0,-8.0,-5.9,4.6,-5.6,5.6,31.0,2.0,-5.0,-1.0,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,1.0,-6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-1.0,-0.5,-1.0,10.5,2.4,-7.0,-8.5,4.6,6.0,5.2,-4.4,-6.0,-2.5,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-4.6,4.5,17.0,0.5,-2.5,-2.5,3.5,-0.5,0.5,-6.0,1.0,1.0,0.5,16.0,3.0,3.0,-5.4,1.0,3.5,1.0,1.0,0.5,1.0,-0.5,-2.4,0.5,0.5,1.0,-1.0,4.0,3.0,0.5,2.0,-1.0,1.0,-1.0,2.0,0.5,-17.5,2.0,-1.0,2.0,1.0,2.0,3.0,5.0,2.0,100.0,3.0,0.5,-4.0],"gain_curve":[7.6,4.0,7.699999999999999,-1.0999999999999996,-0.20000000000000018,-15.7,0.0,-2.0,-39.4,16.5,-8.2,4.7,-5.5,-20.0,7.0,4.9,8.5,3.1999999999999997,0.9000000000000004,-17.4,-2.5,4.5,0.0,0.0,1.0,2.5,1.5,0.0,-1.5,0.0,-3.0,0.0,-0.5,7.0,0.0,-1.5,1.0,-2.0,-2.0,-92.0,11.0,1.5,-0.5,-1.5,1.5,1.0,-9.0,0.0,1.0,2.0,-17.0,-5.4,12.5,13.2,-2.3999999999999995,-3.8,-11.7,2.2,10.5,1.5,0.0,2.5,0.0,15.5,-2.0,2.0,-4.7,-1.5,-1.5,5.1,-5.5,-12.0,0.0,1.5,3.5,-4.0,0.0,-1.0,6.5,-1.5,2.0,0.0,-15.5,-4.0,-4.0,4.9,0.0,-2.5,-0.5,-2.0,-1.0,-1.5,7.0,1.4,-1.5,-25.5,99.0,-1.0,-3.5,-2.5,1.5,2.0,5.0,-0.5,2.0,3.0,2.5,16.5,1.0,0.0,-1.5,1.0,3.5,-1.0,-4.0,-3.0,-102.0,-1.0,-4.5,6.0],"cost_curve":[0.04476,0.0574815,0.06369195,0.07652835,0.07460055,0.0818442,0.0690504,0.0623646,0.0903252,0.0768381,0.0612603,0.07552245,0.06112095,0.0671796,0.06845775,0.07332015,0.08576025,0.09476895,0.09097545,0.1449264,0.015492,0.009195,0.0384,0.028503,0.032421,0.038571,0.038916,0.045525,0.06317925,0.0608505,0.039393,0.0,0.0,0.050319,0.0,0.04287,0.045666,0.06654615,0.051243,0.04484925,0.1523094,0.011247,0.022557,0.014559,0.0,0.0488805,0.02427,0.040914,0.0,0.02731875,0.1064259,0.0748698,0.0861756,0.08460885,0.09086715,0.0617478,0.0745776,0.11220525,0.12294465,0.07844505,0.041076,0.033984,0.0,0.12779565,0.07588425,0.07714635,0.05872395,0.031236,0.010944,0.0,0.06324675,0.10430535,0.0,0.0775011,0.12089625,0.014754,0.045714,0.012954,0.0,0.01311,0.08930145,0.0,0.0,0.0948174,0.08305035,0.016989,0.09951885,0.05676,0.0,0.10145355,0.066858,0.015681,0.119202,0.1061586,0.1130502,0.1231641,0.06594825,0.0691704,0.0,0.0,0.03771675,0.037371,0.0961767,0.0,0.029376,0.0642189,0.023667,0.045144,0.0566562,0.033372,0.0,0.039354,0.06324615,0.07715475,0.036183,0.0751161,0.07245945,0.0461016,0.16398195,0.04783185]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"exploitable_poker","run_index":4,"reward":69.80000000000001,"baseline_reward":317.0,"reference_reward":1138.5,"gain":-247.2,"normalized_reward":-0.06348890436859388,"normalized_gain":-0.3009129640900791,"cost_usd":7.88591685,"latency_seconds":20.519944,"instance_count":120,"reward_curve":[14.5,-2.4,3.0,26.0,-7.0,14.0,3.0,2.5,1.0,-1.0,-13.5,-1.0,5.5,14.5,-1.0,-1.0,-3.0,-0.5,2.2,-1.0,-1.0,0.5,9.0,-1.0,-1.0,2.0,2.0,-2.0,-6.5,9.0,-2.0,-0.5,0.5,-0.5,-2.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.5,-0.5,1.0,0.5,0.5,-1.0,-1.0,-0.5,-1.0,-0.5,-2.5,-7.5,-3.5,-3.5,2.5,2.5,-4.2,15.0,2.2,1.0,1.0,3.5,1.0,0.5,-0.5,-5.4,-2.2,-1.0,4.9,0.5,-2.4,1.0,-0.5,-2.4,5.0,1.0,-1.0,0.5,0.5,-1.0,0.5,-1.0,1.0,-1.0,-1.0,0.5,3.0,-1.0,11.5,1.0,0.5,-1.0,2.5,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-2.0,0.5,-3.0,-1.0,-1.0,-2.0,2.0,11.0,0.5,0.5,-1.0,3.0,1.0,1.0,1.0,0.5,2.0,-1.0,-10.0,-2.0,-1.0],"baseline_reward_curve":[-2.4,2.0,-2.4,12.5,-5.0,2.2,3.0,-1.0,34.0,-14.0,5.2,-3.7,4.5,21.0,-8.0,-5.9,4.6,-5.6,5.6,31.0,2.0,-5.0,-1.0,1.0,1.0,-0.5,0.5,-1.0,0.5,-1.0,2.0,0.5,1.0,-6.0,0.5,0.5,-2.0,0.0,1.0,94.0,-1.0,-2.0,-0.5,1.0,-1.0,-2.0,10.0,-1.0,-0.5,-1.0,10.5,2.4,-7.0,-8.5,4.6,6.0,5.2,-4.4,-6.0,-2.5,1.0,0.5,0.5,-2.0,1.0,1.0,1.0,1.0,1.0,-4.6,4.5,17.0,0.5,-2.5,-2.5,3.5,-0.5,0.5,-6.0,1.0,1.0,0.5,16.0,3.0,3.0,-5.4,1.0,3.5,1.0,1.0,0.5,1.0,-0.5,-2.4,0.5,0.5,1.0,-1.0,4.0,3.0,0.5,2.0,-1.0,1.0,-1.0,2.0,0.5,-17.5,2.0,-1.0,2.0,1.0,2.0,3.0,5.0,2.0,100.0,3.0,0.5,-4.0],"gain_curve":[16.9,-4.4,5.4,13.5,-2.0,11.8,0.0,3.5,-33.0,13.0,-18.7,2.7,1.0,-6.5,7.0,4.9,-7.6,5.1,-3.3999999999999995,-32.0,-3.0,5.5,10.0,-2.0,-2.0,2.5,1.5,-1.0,-7.0,10.0,-4.0,-1.0,-0.5,5.5,-2.5,-1.5,1.0,-1.0,-2.0,-95.0,1.5,1.5,1.5,-0.5,1.5,1.0,-11.0,0.5,-0.5,0.5,-13.0,-9.9,3.5,5.0,-2.0999999999999996,-3.5,-9.4,19.4,8.2,3.5,0.0,3.0,0.5,2.5,-1.5,-6.4,-3.2,-2.0,3.9000000000000004,5.1,-6.9,-16.0,-1.0,0.10000000000000009,7.5,-2.5,-0.5,0.0,6.5,-2.0,-0.5,-1.5,-15.0,-4.0,-4.0,5.9,2.0,-4.5,10.5,0.0,0.0,-2.0,3.0,1.4,-1.5,0.5,-2.0,0.0,-5.0,-5.0,0.0,-5.0,0.0,-2.0,-1.0,0.0,10.5,18.0,-1.5,0.0,1.0,0.0,-1.0,-2.0,-4.5,0.0,-101.0,-13.0,-2.5,3.0],"cost_curve":[0.035673,0.07689525,0.06892425,0.08151135,0.0761883,0.08248395,0.061992,0.07378455,0.0626715,0.08761215,0.07138935,0.05788425,0.0662358,0.073419,0.0721203,0.06397035,0.06056235,0.015579,0.0773805,0.06665055,0.06010365,0.0,0.10826955,0.0882183,0.052785,0.05840385,0.05868675,0.044667,0.1527945,0.0738636,0.0564765,0.010233,0.0,0.008943,0.0552975,0.040749,0.0812955,0.04992,0.05397,0.052863,0.0,0.011079,0.0313395,0.0,0.0,0.06037425,0.06089625,0.012414,0.06161625,0.015912,0.1188243,0.1387956,0.15202455,0.1219647,0.12464925,0.07743315,0.0746505,0.0868872,0.10243785,0.1317342,0.06884625,0.04773,0.070392,0.0,0.008877,0.16152825,0.087366,0.07933905,0.0767559,0.0,0.0751962,0.06778965,0.013191,0.08266485,0.1246026,0.0810507,0.08708565,0.0,0.0,0.1000317,0.0,0.0925635,0.013188,0.0819663,0.07943115,0.0,0.0353865,0.10167615,0.12288675,0.1069167,0.0,0.10076865,0.1251141,0.10494345,0.1067286,0.05801385,0.08680425,0.10703175,0.09700725,0.1256088,0.0,0.10055175,0.049464,0.020601,0.1051599,0.05790945,0.24101085,0.0,0.0,0.0393315,0.12712185,0.1019163,0.0399675,0.03812175,0.0,0.0393675,0.0758895,0.15400155,0.0640839,0.029436]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"sales_prediction","run_index":0,"reward":10.4098,"baseline_reward":4.2825,"reference_reward":12.0,"gain":6.127300000000001,"normalized_reward":0.7516360285504554,"normalized_gain":0.7939488176222871,"cost_usd":4.09962585,"latency_seconds":17.310133,"instance_count":12,"reward_curve":[0.7457,0.6908,0.9052,0.7682,0.8317,0.8802,0.8873,0.928,0.9004,0.9518,0.9661,0.9544],"baseline_reward_curve":[-0.085,0.3923,0.4285,0.5051,0.2898,0.4707,0.7085,0.2477,0.4212,0.2978,0.2664,0.3395],"gain_curve":[0.8307,0.2985,0.4767,0.2631,0.5419,0.4095,0.17879999999999996,0.6803,0.47919999999999996,0.6539999999999999,0.6997,0.6149],"cost_curve":[0.2481309,0.32502315,0.3745497,0.22675695,0.5595645,0.3282453,0.29684325,0.37178295,0.2806194,0.3748044,0.43811655,0.2751888]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"sales_prediction","run_index":1,"reward":10.045499999999999,"baseline_reward":4.2825,"reference_reward":12.0,"gain":5.762999999999999,"normalized_reward":0.6947381573398721,"normalized_gain":0.7467444120505343,"cost_usd":3.82458555,"latency_seconds":18.240162,"instance_count":12,"reward_curve":[0.4073,0.7622,0.8857,0.8217,0.8611,0.8886,0.8486,0.8779,0.8583,0.9352,0.9596,0.9393],"baseline_reward_curve":[-0.085,0.3923,0.4285,0.5051,0.2898,0.4707,0.7085,0.2477,0.4212,0.2978,0.2664,0.3395],"gain_curve":[0.4923,0.3699,0.45720000000000005,0.3166,0.5712999999999999,0.41789999999999994,0.1401,0.6302,0.43709999999999993,0.6374,0.6932,0.5998],"cost_curve":[0.33158595,0.4791549,0.30955845,0.3505248,0.34665,0.35869095,0.24589635,0.27840375,0.33171465,0.2521842,0.3159408,0.22428075]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"sales_prediction","run_index":2,"reward":9.9574,"baseline_reward":4.2825,"reference_reward":12.0,"gain":5.6749,"normalized_reward":0.6809783372639667,"normalized_gain":0.735328798185941,"cost_usd":3.6620883,"latency_seconds":19.9217,"instance_count":12,"reward_curve":[0.6201,0.7185,0.8295,0.8133,0.7761,0.8512,0.8516,0.9033,0.8438,0.9116,0.929,0.9094],"baseline_reward_curve":[-0.085,0.3923,0.4285,0.5051,0.2898,0.4707,0.7085,0.2477,0.4212,0.2978,0.2664,0.3395],"gain_curve":[0.7051,0.32620000000000005,0.401,0.30820000000000003,0.4863,0.38049999999999995,0.1431,0.6556,0.4226,0.6137999999999999,0.6626000000000001,0.5699],"cost_curve":[0.2626812,0.33474285,0.3624378,0.3182196,0.38754345,0.3293205,0.2648061,0.3057267,0.3506835,0.26198115,0.2800071,0.20393835]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"sales_prediction","run_index":3,"reward":9.844499999999998,"baseline_reward":4.2825,"reference_reward":12.0,"gain":5.5619999999999985,"normalized_reward":0.6633451512643102,"normalized_gain":0.7206997084548102,"cost_usd":3.56706915,"latency_seconds":19.443044,"instance_count":12,"reward_curve":[0.4248,0.7708,0.8346,0.8023,0.8507,0.8878,0.8282,0.9245,0.8847,0.854,0.8758,0.9063],"baseline_reward_curve":[-0.085,0.3923,0.4285,0.5051,0.2898,0.4707,0.7085,0.2477,0.4212,0.2978,0.2664,0.3395],"gain_curve":[0.5098,0.37850000000000006,0.4061,0.2972,0.5609,0.4171,0.11970000000000003,0.6768,0.4635,0.5562,0.6093999999999999,0.5668],"cost_curve":[0.3056229,0.38533965,0.2666202,0.3533058,0.30725445,0.3181881,0.19611585,0.26426115,0.2965725,0.31808115,0.3088131,0.2468943]},{"run_name":"icl-notepad-claude-sonnet-4-6","task":"sales_prediction","run_index":4,"reward":10.106200000000001,"baseline_reward":4.2825,"reference_reward":12.0,"gain":5.823700000000001,"normalized_reward":0.7042185328064725,"normalized_gain":0.7546096533851637,"cost_usd":3.6114156,"latency_seconds":18.188303,"instance_count":12,"reward_curve":[0.7337,0.779,0.834,0.8091,0.8094,0.7971,0.8172,0.8948,0.8546,0.9183,0.9145,0.9445],"baseline_reward_curve":[-0.085,0.3923,0.4285,0.5051,0.2898,0.4707,0.7085,0.2477,0.4212,0.2978,0.2664,0.3395],"gain_curve":[0.8187,0.38670000000000004,0.40549999999999997,0.30400000000000005,0.5196000000000001,0.3264,0.10870000000000002,0.6471,0.4334,0.6205,0.6480999999999999,0.605],"cost_curve":[0.3058917,0.31961535,0.26270865,0.27264945,0.31381395,0.30024375,0.29577705,0.28259625,0.3620223,0.29704125,0.29410275,0.30495315]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":0,"reward":38.428299999999965,"baseline_reward":19.7597,"reference_reward":90.0,"gain":18.668599999999966,"normalized_reward":0.26576830535742196,"normalized_gain":0.2657818944395164,"cost_usd":1.739498,"latency_seconds":11.850914,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3669,0.3357,0.3306,0.3279,0.3274,0.3437,0.3192,0.3621,0.3704,0.3659,0.4692,0.4458,0.462,0.454,0.4568,0.4489,0.5425,0.5845,0.4893,0.5743,0.564,0.4607,0.4853,0.3259,0.1405,0.4388,0.5022,0.1606,0.466,0.369,0.2125,0.4678,0.4555,0.5184,0.4668,0.4512,0.4769,0.4769,0.4364,0.4153,0.4338,0.3988,0.4595,0.4696,0.479,0.479,0.479,0.479,0.479,0.4765,0.479,0.479,0.479,0.479,0.479,0.4345,0.4345,0.4345,0.4517,0.4702,0.4702,0.4945,0.4542,0.4959,0.5154,0.5154,0.4696,0.4832,0.486,0.486,0.486,0.4051,0.4051,0.4051,0.4051,0.4051,0.3698,0.3826,0.3465,0.3465,0.3465,0.3465,0.3938,0.3938,0.4388,0.4523,0.4523],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.1405,0.1116,0.11780000000000002,0.10060000000000002,0.13240000000000002,0.12160000000000001,0.10659999999999997,0.12169999999999997,0.1419,0.1466,0.2209,0.25379999999999997,0.26460000000000006,0.23010000000000003,0.22979999999999998,0.24240000000000003,0.2951,0.38270000000000004,0.2874,0.36130000000000007,0.3556999999999999,0.2363,0.252,0.11650000000000002,-0.06999999999999998,0.20760000000000003,0.295,-0.037599999999999995,0.25750000000000006,0.1595,0.009800000000000003,0.2443,0.2416,0.3155,0.2254,0.2539,0.2566,0.2505,0.24380000000000002,0.1756,0.21220000000000003,0.17149999999999999,0.23210000000000003,0.24810000000000001,0.2481,0.24569999999999997,0.25029999999999997,0.2613,0.25749999999999995,0.269,0.2663,0.25439999999999996,0.25379999999999997,0.2792,0.24289999999999998,0.239,0.21889999999999998,0.1926,0.24029999999999999,0.25360000000000005,0.2492,0.2964,0.2387,0.2687,0.2602,0.3066,0.2484,0.22910000000000003,0.27249999999999996,0.23879999999999998,0.2557,0.18430000000000002,0.16740000000000002,0.16290000000000002,0.1922,0.15630000000000002,0.17010000000000003,0.1747,0.1289,0.1299,0.13639999999999997,0.12719999999999998,0.1934,0.19419999999999998,0.23710000000000003,0.20809999999999998,0.23009999999999997],"cost_curve":[0.009742,0.010314,0.015692,0.020488,0.017118,0.016524,0.01903,0.018894,0.022488,0.022364,0.024718,0.019976,0.019338,0.020912,0.02013,0.023538,0.025332,0.020202,0.021514,0.0285,0.023932,0.026372,0.024776,0.030174,0.019186,0.01648,0.017796,0.016768,0.016262,0.019662,0.02069,0.016828,0.015842,0.014436,0.015854,0.015228,0.020572,0.01898,0.016784,0.019238,0.0204,0.018254,0.01733,0.019872,0.017248,0.019694,0.018108,0.022646,0.016436,0.015786,0.017206,0.01732,0.016314,0.016676,0.017808,0.01814,0.01746,0.01494,0.019444,0.017962,0.018926,0.019672,0.022886,0.017192,0.01948,0.019248,0.017852,0.019178,0.017716,0.018382,0.0184,0.017944,0.022172,0.01732,0.02132,0.020056,0.019082,0.020556,0.01834,0.02076,0.021652,0.019994,0.020172,0.019236,0.02274,0.021244,0.02217,0.025132,0.018798,0.02016]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":1,"reward":28.529200000000003,"baseline_reward":19.7597,"reference_reward":90.0,"gain":8.769500000000004,"normalized_reward":0.12483378180213271,"normalized_gain":0.12484997928539604,"cost_usd":2.285872,"latency_seconds":15.25851,"instance_count":90,"reward_curve":[0.2072,0.2103,0.2379,0.2114,0.2148,0.2148,0.2475,0.2603,0.2481,0.3554,0.2635,0.2364,0.2222,0.2369,0.2439,0.2149,0.2418,0.2452,0.2079,0.3234,0.3035,0.3496,0.3269,0.3493,0.2627,0.327,0.3333,0.3475,0.3333,0.3497,0.4022,0.4087,0.4022,0.377,0.407,0.3528,0.377,0.414,0.377,0.377,0.3626,0.4022,0.3676,0.3255,0.2417,0.3848,0.39,0.3072,0.4392,0.4299,0.3812,0.3252,0.3694,0.3468,0.3521,0.3384,0.2829,0.3729,0.4028,0.322,0.2993,0.3172,0.362,0.2794,0.4198,0.3424,0.3412,0.3333,0.2387,0.2373,0.2315,0.2113,0.3842,0.2075,0.344,0.372,0.3956,0.4091,0.4242,0.2279,0.3318,0.2172,0.3413,0.3275,0.4366,0.2065,0.2134,0.2378,0.305,0.3683],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.03790000000000002,0.0262,-0.014999999999999986,-0.009300000000000003,0.0020000000000000018,0.020199999999999996,0.06529999999999997,0.025999999999999995,0.14279999999999998,0.02310000000000001,0.00789999999999999,0.0029000000000000137,-0.011399999999999993,0.0519,0.017500000000000016,0.0179,0.018199999999999994,0.0014000000000000123,0.07600000000000001,0.10169999999999998,0.14770000000000003,0.11390000000000003,0.141,0.0383,0.0937,0.12389999999999998,0.13699999999999998,0.1021,0.14250000000000002,0.20400000000000001,0.20020000000000002,0.1927,0.1743,0.18349999999999997,0.1389,0.1741,0.17259999999999998,0.1797,0.1567,0.1362,0.2096,0.12789999999999999,0.10390000000000002,0.014399999999999996,0.15739999999999998,0.1685,0.07629999999999998,0.20589999999999997,0.20120000000000002,0.16349999999999998,0.10369999999999999,0.16190000000000002,0.1341,0.12750000000000003,0.11319999999999997,0.08309999999999998,0.1368,0.20729999999999998,0.1064,0.05740000000000001,0.10579999999999998,0.1454,0.05839999999999998,0.2217,0.12689999999999999,0.11399999999999999,0.0781,0.029899999999999982,0.016100000000000003,-0.02259999999999998,-0.0022000000000000075,0.13699999999999998,-0.022800000000000015,0.12319999999999998,0.1343,0.1534,0.1962,0.17540000000000003,0.028200000000000003,0.12389999999999998,-0.0003999999999999837,0.1247,0.1174,0.2173,0.006099999999999994,0.013800000000000007,0.03610000000000002,0.06079999999999999,0.1461],"cost_curve":[0.009518,0.014004,0.018886,0.015784,0.01601,0.014588,0.015308,0.014648,0.01502,0.016878,0.018622,0.018512,0.022724,0.01734,0.019732,0.018478,0.021038,0.019222,0.022,0.022116,0.022076,0.025542,0.020226,0.022034,0.022814,0.024296,0.024886,0.023696,0.021686,0.023674,0.02667,0.024548,0.024868,0.022696,0.024,0.025002,0.023074,0.0237,0.028396,0.030184,0.02462,0.023068,0.023732,0.025,0.027038,0.021994,0.025918,0.022396,0.025546,0.02498,0.028044,0.029328,0.030254,0.027088,0.030742,0.025366,0.02584,0.02837,0.028026,0.02501,0.02548,0.026122,0.031268,0.028504,0.031812,0.028642,0.028546,0.032012,0.026804,0.032124,0.027788,0.028188,0.03109,0.031552,0.029676,0.031164,0.027214,0.035094,0.029062,0.02994,0.032134,0.032202,0.036294,0.03005,0.032784,0.036786,0.036068,0.034932,0.035012,0.030672]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":2,"reward":21.443500000000007,"baseline_reward":19.7597,"reference_reward":90.0,"gain":1.6838000000000086,"normalized_reward":0.023953928729053723,"normalized_gain":0.02397199328590579,"cost_usd":2.745924,"latency_seconds":18.27249,"instance_count":90,"reward_curve":[0.2482,0.2864,0.2676,0.23,0.2465,0.2138,0.2278,0.2087,0.2417,0.2932,0.3035,0.242,0.2568,0.3,0.2157,0.2663,0.2121,0.196,0.2145,0.2144,0.1787,0.1787,0.1787,0.2237,0.1917,0.182,0.2016,0.191,0.2391,0.2104,0.2133,0.2334,0.205,0.2761,0.2484,0.2925,0.2366,0.2803,0.2863,0.3548,0.3472,0.3492,0.3367,0.2782,0.2861,0.2868,0.3496,0.3237,0.3002,0.3078,0.2482,0.2378,0.337,0.2586,0.2213,0.2473,0.2372,0.2311,0.253,0.2211,0.2648,0.2517,0.2346,0.3102,0.2626,0.1593,0.2274,0.2539,0.1792,0.2063,0.2017,0.1953,0.1548,0.1273,0.192,0.192,0.2093,0.2059,0.1262,0.2006,0.221,0.1792,0.221,0.243,0.1792,0.192,0.2204,0.2342,0.2194,0.2334],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.038199999999999984,0.055900000000000005,0.00360000000000002,0.022400000000000003,0.0010000000000000009,0.0005000000000000004,0.01369999999999999,0.019600000000000006,0.0806,0.06309999999999999,0.013499999999999984,0.03749999999999998,0.051699999999999996,0.0237,0.06889999999999999,-0.011799999999999977,-0.031,0.008000000000000007,-0.033,-0.02310000000000001,-0.0232,-0.0343,0.015399999999999997,-0.03269999999999998,-0.05130000000000001,-0.007800000000000001,-0.01949999999999999,0.007900000000000018,0.0032000000000000084,0.015100000000000002,0.024900000000000005,-0.004500000000000004,0.07340000000000002,0.024900000000000005,0.07859999999999998,0.03370000000000001,0.03889999999999999,0.089,0.1345,0.12080000000000002,0.15660000000000002,0.097,0.05660000000000001,0.05880000000000002,0.05940000000000001,0.12810000000000002,0.0928,0.06690000000000002,0.07910000000000003,0.0305,0.01630000000000001,0.12950000000000003,0.045899999999999996,-0.0032999999999999974,0.02209999999999998,0.03739999999999999,-0.0050000000000000044,0.057499999999999996,0.005499999999999977,0.022899999999999976,0.040299999999999975,0.018000000000000016,0.08919999999999997,0.0645,-0.0562,0.00019999999999997797,-0.0012999999999999678,-0.029600000000000015,-0.014899999999999997,-0.0524,-0.018199999999999994,-0.09240000000000001,-0.10300000000000001,-0.028799999999999992,-0.04569999999999999,-0.032899999999999985,-0.007000000000000006,-0.12259999999999999,0.0009000000000000119,0.0131,-0.03839999999999999,0.004400000000000015,0.032899999999999985,-0.0401,-0.00839999999999999,0.020800000000000013,0.0325,-0.02479999999999999,0.011199999999999988],"cost_curve":[0.011214,0.011848,0.012294,0.013402,0.019128,0.017208,0.018444,0.0203,0.01927,0.01816,0.019226,0.02002,0.020646,0.021708,0.023574,0.02345,0.025346,0.023164,0.026204,0.020644,0.023284,0.027548,0.021178,0.023332,0.026122,0.02523,0.023922,0.023646,0.022594,0.023544,0.025026,0.028564,0.027224,0.031144,0.027154,0.027832,0.026024,0.030548,0.033644,0.031804,0.030708,0.029928,0.031108,0.03388,0.030428,0.03595,0.034044,0.03489,0.028942,0.034674,0.02693,0.033576,0.030182,0.024646,0.029412,0.03387,0.037498,0.035156,0.037296,0.033384,0.030808,0.041698,0.039668,0.03928,0.038016,0.040744,0.03752,0.037756,0.03873,0.042406,0.03517,0.035876,0.041964,0.039688,0.039998,0.03302,0.044144,0.033918,0.03735,0.04045,0.038462,0.040816,0.038082,0.045012,0.041224,0.048212,0.035198,0.03899,0.044726,0.045882]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":3,"reward":32.77910000000001,"baseline_reward":19.7597,"reference_reward":90.0,"gain":13.019400000000008,"normalized_reward":0.18534005324677177,"normalized_gain":0.18535513088640007,"cost_usd":2.40832,"latency_seconds":15.769983,"instance_count":90,"reward_curve":[0.192,0.2393,0.2297,0.2426,0.2509,0.2517,0.2533,0.2977,0.3079,0.3314,0.3362,0.3362,0.3362,0.348,0.3415,0.3407,0.3407,0.3532,0.3439,0.3444,0.3462,0.3653,0.3653,0.3767,0.3767,0.3767,0.4311,0.4356,0.4054,0.3835,0.3835,0.3835,0.4202,0.4202,0.4227,0.4177,0.4177,0.4413,0.4225,0.4225,0.4225,0.4779,0.4779,0.4779,0.4271,0.4907,0.4907,0.4907,0.4907,0.4499,0.3894,0.3894,0.3894,0.3894,0.3894,0.3894,0.3894,0.3894,0.3894,0.3894,0.3806,0.3806,0.3806,0.3697,0.3671,0.3671,0.3671,0.3671,0.3671,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243,0.3243],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.008899999999999991,0.017999999999999988,0.01620000000000002,0.026800000000000018,0.03889999999999999,0.026000000000000023,0.10270000000000001,0.08580000000000002,0.11879999999999996,0.0958,0.10769999999999999,0.1169,0.09969999999999998,0.14950000000000002,0.1433,0.11680000000000001,0.1262,0.1374,0.09699999999999998,0.1444,0.16340000000000002,0.15230000000000002,0.16839999999999997,0.1523,0.14339999999999997,0.22169999999999998,0.2251,0.1742,0.1763,0.18530000000000002,0.17500000000000002,0.21070000000000003,0.21750000000000003,0.19920000000000002,0.2038,0.21480000000000002,0.19990000000000002,0.22519999999999998,0.2022,0.1961,0.2853,0.2382,0.2563,0.19979999999999998,0.26330000000000003,0.2692,0.25980000000000003,0.2574,0.22120000000000004,0.17170000000000002,0.16790000000000002,0.18190000000000003,0.17670000000000002,0.16480000000000003,0.1642,0.18960000000000002,0.15330000000000002,0.19390000000000002,0.1738,0.1387,0.1692,0.164,0.14869999999999997,0.16899999999999998,0.15159999999999998,0.13989999999999997,0.1119,0.15829999999999997,0.10309999999999997,0.07019999999999998,0.11079999999999998,0.07709999999999997,0.09399999999999997,0.10349999999999998,0.08659999999999998,0.08209999999999998,0.11139999999999997,0.07549999999999998,0.12459999999999999,0.11639999999999998,0.10669999999999999,0.10769999999999999,0.11419999999999997,0.10499999999999998,0.12389999999999998,0.12469999999999998,0.12259999999999999,0.08009999999999998,0.10209999999999997],"cost_curve":[0.008092,0.012168,0.017232,0.019556,0.02167,0.017588,0.015974,0.019066,0.017844,0.02034,0.020128,0.018746,0.02295,0.0224,0.028422,0.026148,0.025832,0.025012,0.027,0.028936,0.028052,0.033126,0.029906,0.031858,0.032432,0.031486,0.035144,0.034726,0.030262,0.038348,0.035792,0.03696,0.041604,0.045252,0.04117,0.04025,0.039228,0.042078,0.044136,0.04405,0.042766,0.04442,0.046112,0.021464,0.024458,0.019666,0.017412,0.022584,0.019408,0.02214,0.02348,0.024122,0.02098,0.025276,0.019104,0.022022,0.019822,0.02304,0.018914,0.026864,0.027264,0.021428,0.02534,0.026502,0.02293,0.022152,0.025504,0.030148,0.025012,0.030706,0.02378,0.023628,0.026444,0.022452,0.024054,0.024552,0.027548,0.023868,0.02414,0.024246,0.024902,0.024586,0.025052,0.026798,0.023666,0.026478,0.023916,0.023738,0.026428,0.02604]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"blind_spectrum_monitoring","run_index":4,"reward":24.42829999999999,"baseline_reward":19.7597,"reference_reward":90.0,"gain":4.668599999999991,"normalized_reward":0.06644883896410807,"normalized_gain":0.06646611702968225,"cost_usd":4.80056,"latency_seconds":29.462859,"instance_count":90,"reward_curve":[0.2273,0.2562,0.3134,0.2678,0.2047,0.258,0.2485,0.2367,0.2157,0.228,0.2784,0.2181,0.2724,0.2557,0.2354,0.2721,0.2445,0.2488,0.2146,0.2146,0.3167,0.3442,0.3193,0.3302,0.3869,0.3536,0.42,0.4024,0.44,0.4049,0.3957,0.3753,0.3443,0.3715,0.3869,0.4184,0.3948,0.4357,0.415,0.2824,0.3846,0.39,0.3469,0.3149,0.2632,0.2559,0.2559,0.2559,0.2632,0.2559,0.2559,0.2559,0.2559,0.2573,0.262,0.2589,0.2589,0.2589,0.2589,0.2589,0.2636,0.2636,0.2636,0.2788,0.2636,0.2636,0.2636,0.2636,0.1727,0.1879,0.1879,0.203,0.1879,0.1879,0.1879,0.1879,0.1879,0.1996,0.2708,0.1879,0.1879,0.1879,0.203,0.1879,0.1879,0.1879,0.1879,0.1879,0.1879,0.2318],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2135,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,0.00799999999999998,0.10170000000000001,0.04139999999999999,-0.0194,0.04520000000000002,0.021199999999999997,0.04169999999999999,-0.006399999999999989,0.015399999999999997,0.03799999999999998,-0.01040000000000002,0.05309999999999998,0.00739999999999999,0.043399999999999994,0.07470000000000002,0.020600000000000007,0.021799999999999986,0.008100000000000024,-0.032799999999999996,0.11489999999999997,0.1423,0.10629999999999998,0.12189999999999998,0.16250000000000003,0.12030000000000002,0.21059999999999998,0.1919,0.2088,0.1977,0.1975,0.16680000000000003,0.1348,0.1688,0.16340000000000002,0.2045,0.1919,0.19429999999999997,0.21769999999999998,0.06209999999999999,0.1582,0.19740000000000002,0.10719999999999999,0.09330000000000002,0.03589999999999999,0.028500000000000025,0.034400000000000014,0.025000000000000022,0.029899999999999982,0.02720000000000003,0.03820000000000001,0.034400000000000014,0.048400000000000026,0.04459999999999997,0.03740000000000002,0.03370000000000001,0.059100000000000014,0.022800000000000015,0.06340000000000001,0.043300000000000005,0.021699999999999997,0.052199999999999996,0.047000000000000014,0.05779999999999999,0.0655,0.048100000000000004,0.03639999999999999,0.008400000000000019,-0.03610000000000002,-0.033299999999999996,-0.06619999999999998,-0.010499999999999982,-0.05929999999999999,-0.04239999999999999,-0.032899999999999985,-0.04979999999999998,-0.05429999999999999,-0.013300000000000006,0.021999999999999992,-0.011799999999999977,-0.01999999999999999,-0.029699999999999976,-0.013599999999999973,-0.022199999999999998,-0.031399999999999983,-0.012499999999999983,-0.011699999999999988,-0.013799999999999979,-0.05629999999999999,0.009599999999999997],"cost_curve":[0.007898,0.016462,0.016574,0.020852,0.021568,0.02437,0.022708,0.023462,0.02978,0.02437,0.025374,0.02278,0.023128,0.025662,0.027716,0.026038,0.0249,0.027892,0.025728,0.027218,0.029464,0.026156,0.024702,0.021456,0.02767,0.02655,0.030016,0.024862,0.032844,0.029634,0.03422,0.032032,0.033162,0.034458,0.032718,0.036076,0.036212,0.038706,0.040416,0.04054,0.040022,0.046734,0.046494,0.047626,0.049578,0.045878,0.046546,0.047348,0.050708,0.056252,0.05437,0.058524,0.04764,0.049836,0.055458,0.068506,0.05674,0.056524,0.073172,0.067608,0.05995,0.07413,0.061748,0.047466,0.073796,0.07782,0.083346,0.07409,0.08443,0.087656,0.078496,0.075228,0.09587,0.081218,0.085528,0.095764,0.08964,0.033036,0.078884,0.101234,0.112796,0.111134,0.098204,0.12752,0.092082,0.104464,0.104336,0.115338,0.117856,0.115562]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":0,"reward":6.049999999999999,"baseline_reward":8.425,"reference_reward":19.0,"gain":-2.3750000000000018,"normalized_reward":-0.35602094240837695,"normalized_gain":-0.22458628841607584,"cost_usd":3.5764382,"latency_seconds":4.345913,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.75,0.65,0.0,0.575,0.0,0.625,0.0,0.875,0.0,0.925,0.925,0.125,0.0,0.0,0.0,0.6],"baseline_reward_curve":[0.0,0.0,0.0,0.725,0.725,0.0,0.65,0.775,0.725,0.5,0.9,0.325,0.85,0.875,0.0,0.0,0.75,0.125,0.5],"gain_curve":[0.0,0.0,0.0,0.025000000000000022,-0.07499999999999996,0.0,-0.07500000000000007,-0.775,-0.09999999999999998,-0.5,-0.025000000000000022,-0.325,0.07500000000000007,0.050000000000000044,0.125,0.0,-0.75,-0.125,0.09999999999999998],"cost_curve":[0.052606,0.2658906,0.2282992,0.08883,0.1721018,0.096796,0.2180888,0.133424,0.1820828,0.3027854,0.047978,0.4653246,0.02898,0.060976,0.4549446,0.2384632,0.055958,0.204728,0.2781812]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":1,"reward":8.65,"baseline_reward":8.425,"reference_reward":19.0,"gain":0.22499999999999964,"normalized_reward":-0.08376963350785328,"normalized_gain":0.02127659574468082,"cost_usd":3.3745658,"latency_seconds":4.291236,"instance_count":19,"reward_curve":[0.875,0.725,0.7,0.0,0.7,0.0,0.625,0.6,0.0,0.375,0.625,0.0,0.0,0.0,0.725,0.775,0.575,0.65,0.7],"baseline_reward_curve":[0.0,0.0,0.0,0.725,0.725,0.0,0.65,0.775,0.725,0.5,0.9,0.325,0.85,0.875,0.0,0.0,0.75,0.125,0.5],"gain_curve":[0.875,0.725,0.7,-0.725,-0.025000000000000022,0.0,-0.025000000000000022,-0.17500000000000004,-0.725,-0.125,-0.275,-0.325,-0.85,-0.875,0.725,0.775,-0.17500000000000004,0.525,0.19999999999999996],"cost_curve":[0.057286,0.1123762,0.163308,0.3610266,0.090308,0.191608,0.2505856,0.2037136,0.124958,0.2944278,0.1905494,0.2319292,0.1979184,0.087346,0.123368,0.137806,0.1991894,0.2292756,0.127586]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":2,"reward":8.675,"baseline_reward":8.425,"reference_reward":19.0,"gain":0.25,"normalized_reward":-0.08115183246073283,"normalized_gain":0.023640661938534282,"cost_usd":3.458414,"latency_seconds":4.203328,"instance_count":19,"reward_curve":[0.75,0.675,0.0,0.0,0.725,0.675,0.45,0.0,0.75,0.0,0.0,0.0,0.9,0.175,0.8,0.675,0.775,0.35,0.975],"baseline_reward_curve":[0.0,0.0,0.0,0.725,0.725,0.0,0.65,0.775,0.725,0.5,0.9,0.325,0.85,0.875,0.0,0.0,0.75,0.125,0.5],"gain_curve":[0.75,0.675,0.0,-0.725,0.0,0.675,-0.2,-0.775,0.025000000000000022,-0.5,-0.9,-0.325,0.050000000000000044,-0.7,0.8,0.675,0.025000000000000022,0.22499999999999998,0.475],"cost_curve":[0.102982,0.201427,0.194826,0.123088,0.114994,0.130198,0.2750376,0.2062618,0.146444,0.22101,0.1584186,0.366153,0.042714,0.3614812,0.097716,0.139206,0.182162,0.3777008,0.016594]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":3,"reward":6.1,"baseline_reward":8.425,"reference_reward":19.0,"gain":-2.325000000000001,"normalized_reward":-0.35078534031413605,"normalized_gain":-0.21985815602836892,"cost_usd":3.4581188,"latency_seconds":4.395596,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.75,0.0,0.0,0.725,0.7,0.4,0.45,0.75,0.925,0.0,0.0,0.0,0.0,0.475,0.0,0.925],"baseline_reward_curve":[0.0,0.0,0.0,0.725,0.725,0.0,0.65,0.775,0.725,0.5,0.9,0.325,0.85,0.875,0.0,0.0,0.75,0.125,0.5],"gain_curve":[0.0,0.0,0.0,0.025000000000000022,-0.725,0.0,0.07499999999999996,-0.07500000000000007,-0.32499999999999996,-0.04999999999999999,-0.15000000000000002,0.6000000000000001,-0.85,-0.875,0.0,0.0,-0.275,-0.125,0.42500000000000004],"cost_curve":[0.080834,0.04168,0.1738528,0.162132,0.136984,0.161474,0.08831,0.143296,0.2671746,0.3488436,0.113102,0.042368,0.2509656,0.203811,0.3672322,0.2821658,0.4597172,0.103948,0.030228]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"codebase_adaptation","run_index":4,"reward":6.075,"baseline_reward":8.425,"reference_reward":19.0,"gain":-2.3500000000000005,"normalized_reward":-0.35340314136125645,"normalized_gain":-0.2222222222222223,"cost_usd":3.6637224,"latency_seconds":4.203634,"instance_count":19,"reward_curve":[0.7,0.0,0.0,0.0,0.0,0.45,0.0,0.675,0.0,0.625,0.75,0.9,0.0,0.075,0.0,0.75,0.6,0.0,0.55],"baseline_reward_curve":[0.0,0.0,0.0,0.725,0.725,0.0,0.65,0.775,0.725,0.5,0.9,0.325,0.85,0.875,0.0,0.0,0.75,0.125,0.5],"gain_curve":[0.7,0.0,0.0,-0.725,-0.725,0.45,-0.65,-0.09999999999999998,-0.725,0.125,-0.15000000000000002,0.575,-0.85,-0.8,0.0,0.75,-0.15000000000000002,-0.125,0.050000000000000044],"cost_curve":[0.098236,0.11809,0.3706862,0.029494,0.2338218,0.2995708,0.05037,0.15153,0.1987834,0.207157,0.120286,0.046834,0.2385194,0.395511,0.2009662,0.085466,0.2593004,0.2334558,0.3256444]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"cohort_studies","run_index":0,"reward":0.23979999999999999,"baseline_reward":0.6924999999999999,"reference_reward":3.24404,"gain":-0.4526999999999999,"normalized_reward":-0.33549081145478626,"normalized_gain":-0.17742226263354674,"cost_usd":1.455084,"latency_seconds":6.63238,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.1758,0.0,0.0,0.0329,0.0001,0.0,0.0,0.0307,0.0,0.0,0.0,0.0,0.0002,0.0,0.0001],"baseline_reward_curve":[0.0,0.1769,0.0,0.0261,0.0,0.0631,0.0904,0.0001,0.0,0.0,0.0002,0.0905,0.0,0.0,0.0,0.0355,0.2095,0.0002,0.0,0.0],"gain_curve":[0.0,-0.1769,0.0,-0.0261,0.0,0.11270000000000001,-0.0904,-0.0001,0.0329,0.0001,-0.0002,-0.0905,0.0307,0.0,0.0,-0.0355,-0.2095,0.0,0.0,0.0001],"cost_curve":[0.060708,0.086838,0.059668,0.061408,0.055684,0.087978,0.084554,0.07018,0.068472,0.083824,0.063622,0.06235,0.059286,0.059206,0.06064,0.086216,0.083478,0.084348,0.09058,0.086044]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"cohort_studies","run_index":1,"reward":0.6565,"baseline_reward":0.6924999999999999,"reference_reward":3.24404,"gain":-0.03599999999999992,"normalized_reward":-0.15025294060118954,"normalized_gain":-0.014109126253164723,"cost_usd":1.657314,"latency_seconds":5.860379,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.2488,0.0,0.0,0.0001,0.0,0.0,0.0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0386,0.364,0.0048],"baseline_reward_curve":[0.0,0.1769,0.0,0.0261,0.0,0.0631,0.0904,0.0001,0.0,0.0,0.0002,0.0905,0.0,0.0,0.0,0.0355,0.2095,0.0002,0.0,0.0],"gain_curve":[0.0,-0.1769,0.0,-0.0261,0.2488,-0.0631,-0.0904,0.0,0.0,0.0,0.0,-0.0905,0.0,0.0,0.0,-0.0355,-0.2095,0.038400000000000004,0.364,0.0048],"cost_curve":[0.070046,0.106296,0.08378,0.073518,0.116036,0.07335,0.089168,0.088444,0.07174,0.056414,0.073418,0.082072,0.085474,0.093938,0.08009,0.104078,0.077398,0.083504,0.079518,0.069032]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"cohort_studies","run_index":2,"reward":0.1941,"baseline_reward":0.6924999999999999,"reference_reward":3.24404,"gain":-0.4983999999999999,"normalized_reward":-0.35580607590885244,"normalized_gain":-0.195333014571592,"cost_usd":1.653484,"latency_seconds":6.193366,"instance_count":20,"reward_curve":[0.0,0.0731,0.0,0.0142,0.0,0.0,0.0,0.0,0.1065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0002,0.0001,0.0],"baseline_reward_curve":[0.0,0.1769,0.0,0.0261,0.0,0.0631,0.0904,0.0001,0.0,0.0,0.0002,0.0905,0.0,0.0,0.0,0.0355,0.2095,0.0002,0.0,0.0],"gain_curve":[0.0,-0.1038,0.0,-0.0119,0.0,-0.0631,-0.0904,-0.0001,0.1065,0.0,-0.0002,-0.0905,0.0,0.0,0.0,-0.0355,-0.2095,0.0,0.0001,0.0],"cost_curve":[0.069222,0.076908,0.075348,0.08913,0.073282,0.07859,0.107918,0.10315,0.0941,0.062824,0.074274,0.061098,0.088812,0.081558,0.094996,0.08215,0.085442,0.089042,0.083426,0.082214]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"cohort_studies","run_index":3,"reward":0.2069,"baseline_reward":0.6924999999999999,"reference_reward":3.24404,"gain":-0.4855999999999999,"normalized_reward":-0.350116023720405,"normalized_gain":-0.19031643634824455,"cost_usd":1.33938,"latency_seconds":7.341778,"instance_count":20,"reward_curve":[0.0,0.0142,0.1778,0.0,0.0,0.0,0.0,0.0,0.0146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0002,0.0,0.0],"baseline_reward_curve":[0.0,0.1769,0.0,0.0261,0.0,0.0631,0.0904,0.0001,0.0,0.0,0.0002,0.0905,0.0,0.0,0.0,0.0355,0.2095,0.0002,0.0,0.0],"gain_curve":[0.0,-0.1627,0.1778,-0.0261,0.0,-0.0631,-0.0904,-0.0001,0.0146,0.0,-0.0002,-0.0905,0.0,0.0,0.0,-0.0355,-0.2094,0.0,0.0,0.0],"cost_curve":[0.060598,0.072286,0.078778,0.081796,0.054404,0.054134,0.055096,0.069328,0.07376,0.061102,0.059466,0.060148,0.055078,0.053654,0.062942,0.05464,0.099416,0.084092,0.074494,0.074168]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"cohort_studies","run_index":4,"reward":0.3358,"baseline_reward":0.6924999999999999,"reference_reward":3.24404,"gain":-0.3566999999999999,"normalized_reward":-0.2928154200414307,"normalized_gain":-0.13979792595844073,"cost_usd":1.55163,"latency_seconds":6.239967,"instance_count":20,"reward_curve":[0.09,0.0,0.0,0.0,0.0001,0.0,0.0216,0.0,0.0001,0.0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0002,0.0001,0.2235],"baseline_reward_curve":[0.0,0.1769,0.0,0.0261,0.0,0.0631,0.0904,0.0001,0.0,0.0,0.0002,0.0905,0.0,0.0,0.0,0.0355,0.2095,0.0002,0.0,0.0],"gain_curve":[0.09,-0.1769,0.0,-0.0261,0.0001,-0.0631,-0.0688,-0.0001,0.0001,0.0002,-0.0002,-0.0905,0.0,0.0,0.0,-0.0355,-0.2095,0.0,0.0001,0.2235],"cost_curve":[0.081804,0.088296,0.081798,0.075208,0.068374,0.075224,0.087184,0.07652,0.073894,0.082248,0.082702,0.08477,0.074874,0.072938,0.059936,0.08679,0.06086,0.071924,0.082546,0.08374]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"database_exploration","run_index":0,"reward":9.8,"baseline_reward":4.2,"reference_reward":40.0,"gain":5.6000000000000005,"normalized_reward":0.12379110251450681,"normalized_gain":0.15642458100558662,"cost_usd":2.255034,"latency_seconds":4.473307,"instance_count":40,"reward_curve":[0.6,0.0,0.0,0.0,0.0,0.2666666666666667,0.6,0.0,0.4,0.0,0.06666666666666665,0.6666666666666667,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.8666666666666667,0.6666666666666667,0.6666666666666667,0.0,0.0,0.0,0.5333333333333333,0.4666666666666667,0.33333333333333337,0.7333333333333334,0.4,0.4,0.5333333333333333],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.0,0.6666666666666667,0.0,0.33333333333333337,0.1333333333333333,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.33333333333333337,0.4,0.0],"gain_curve":[0.1333333333333333,0.0,0.0,0.0,0.0,0.06666666666666676,0.1333333333333333,0.0,-0.2666666666666667,0.0,-0.2666666666666667,0.5333333333333334,0.0,-0.06666666666666665,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.8666666666666667,0.6666666666666667,0.6666666666666667,0.0,0.0,0.0,0.5333333333333333,0.4666666666666667,0.13333333333333341,0.2666666666666667,0.06666666666666665,0.0,0.5333333333333333],"cost_curve":[0.046608,0.061044,0.103014,0.129216,0.099518,0.090966,0.05285,0.069292,0.074236,0.082108,0.139648,0.033232,0.10968,0.063388,0.036866,0.03295,0.01286,0.013024,0.026594,0.047232,0.018774,0.026054,0.037912,0.035798,0.016568,0.04518,0.069918,0.01033,0.030002,0.04294,0.075522,0.02998,0.046952,0.049328,0.06299,0.087208,0.0256,0.073726,0.0762,0.069726]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"database_exploration","run_index":1,"reward":8.2,"baseline_reward":4.2,"reference_reward":40.0,"gain":3.999999999999999,"normalized_reward":0.07736943907156674,"normalized_gain":0.11173184357541897,"cost_usd":2.5744406,"latency_seconds":4.695875,"instance_count":40,"reward_curve":[0.19999999999999996,0.2666666666666667,0.6666666666666667,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.4666666666666667,0.0,0.0,0.2666666666666667,0.4,0.6,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.4,0.0,0.0,0.33333333333333337,0.0,0.0,0.0,0.4666666666666667,0.8666666666666667,0.5333333333333333,0.5333333333333333],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.0,0.6666666666666667,0.0,0.33333333333333337,0.1333333333333333,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.33333333333333337,0.4,0.0],"gain_curve":[-0.2666666666666667,0.2666666666666667,0.6666666666666667,0.0,0.0,-0.19999999999999996,-0.4666666666666667,0.4666666666666667,-0.6666666666666667,0.4666666666666667,-0.33333333333333337,-0.1333333333333333,0.2666666666666667,-0.1333333333333333,0.6,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.4,0.0,0.0,0.33333333333333337,0.0,0.0,-0.19999999999999996,0.0,0.5333333333333333,0.1333333333333333,0.5333333333333333],"cost_curve":[0.092716,0.090406,0.037714,0.05896,0.132194,0.1419126,0.032534,0.054142,0.063048,0.082782,0.032986,0.030682,0.126668,0.069054,0.04671,0.045248,0.031486,0.056604,0.133998,0.02929,0.122364,0.070312,0.049118,0.054754,0.043548,0.057702,0.035966,0.060114,0.083414,0.087928,0.03663,0.044346,0.083234,0.037904,0.04049,0.100028,0.070604,0.011182,0.049958,0.04571]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"database_exploration","run_index":2,"reward":10.26666666666667,"baseline_reward":4.2,"reference_reward":40.0,"gain":6.066666666666669,"normalized_reward":0.13733075435203104,"normalized_gain":0.1694599627560522,"cost_usd":2.345984,"latency_seconds":4.603031,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.0,0.6666666666666667,0.33333333333333337,0.5333333333333333,0.7333333333333334,0.6666666666666667,0.0,0.0,0.6666666666666667,0.0,0.0,0.7333333333333334,0.4666666666666667,0.6666666666666667,0.0,0.0,0.4,0.4666666666666667,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.4,0.0,0.6666666666666667,0.0,0.0,0.5333333333333333,0.0,0.0,0.5333333333333333,0.0,0.5333333333333333,0.0,0.8],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.0,0.6666666666666667,0.0,0.33333333333333337,0.1333333333333333,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.33333333333333337,0.4,0.0],"gain_curve":[-0.4666666666666667,0.0,0.0,0.0,0.6666666666666667,0.13333333333333341,0.06666666666666665,0.7333333333333334,0.0,0.0,-0.33333333333333337,0.5333333333333334,0.0,-0.5333333333333333,0.7333333333333334,0.4666666666666667,0.6666666666666667,0.0,0.0,0.4,0.4666666666666667,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.4,0.0,0.6666666666666667,0.0,0.0,0.5333333333333333,0.0,0.0,0.33333333333333337,-0.4666666666666667,0.19999999999999996,-0.4,0.8],"cost_curve":[0.03562,0.14071,0.081962,0.044412,0.034068,0.071738,0.060094,0.044112,0.039094,0.0458,0.168848,0.039056,0.04894,0.084768,0.02032,0.069266,0.030608,0.03977,0.033284,0.065778,0.060424,0.078754,0.055112,0.085314,0.032224,0.029988,0.16641,0.07921,0.054168,0.034914,0.075124,0.06482,0.048678,0.0323,0.06103,0.058244,0.03049,0.045342,0.03913,0.01606]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"database_exploration","run_index":3,"reward":7.066666666666667,"baseline_reward":4.2,"reference_reward":40.0,"gain":2.866666666666667,"normalized_reward":0.04448742746615092,"normalized_gain":0.08007448789571696,"cost_usd":2.912162,"latency_seconds":4.410652,"instance_count":40,"reward_curve":[0.0,0.19999999999999996,0.33333333333333337,0.0,0.5333333333333333,0.0,0.06666666666666665,0.0,0.0,0.0,0.5333333333333333,0.5333333333333333,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.4,0.19999999999999996,0.8,0.6,0.0,0.0,0.4,0.19999999999999996,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.5333333333333333,0.6666666666666667,0.0],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.0,0.6666666666666667,0.0,0.33333333333333337,0.1333333333333333,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.33333333333333337,0.4,0.0],"gain_curve":[-0.4666666666666667,0.19999999999999996,0.33333333333333337,0.0,0.5333333333333333,-0.19999999999999996,-0.4,0.0,-0.6666666666666667,0.0,0.19999999999999996,0.4,0.0,-0.5333333333333333,0.0,0.33333333333333337,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.4,0.19999999999999996,0.8,0.6,0.0,0.0,0.4,0.19999999999999996,0.0,0.0,0.0,0.0,0.0,0.20000000000000007,-0.4666666666666667,0.19999999999999996,0.2666666666666667,0.0],"cost_curve":[0.104298,0.097302,0.083836,0.028904,0.07625,0.143436,0.144678,0.064054,0.079692,0.056986,0.050036,0.05179,0.048334,0.055588,0.11183,0.115144,0.125668,0.04297,0.059032,0.146064,0.08168,0.073558,0.0608,0.12657,0.0173,0.037444,0.045578,0.065306,0.086268,0.138742,0.102654,0.028956,0.056048,0.036632,0.021612,0.08507,0.037102,0.049446,0.031318,0.044186]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"database_exploration","run_index":4,"reward":7.266666666666666,"baseline_reward":4.2,"reference_reward":40.0,"gain":3.0666666666666655,"normalized_reward":0.05029013539651837,"normalized_gain":0.08566108007448787,"cost_usd":3.0737244,"latency_seconds":4.32873,"instance_count":40,"reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.4666666666666667,0.7333333333333334,0.4,0.0,0.6666666666666667,0.4,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.4666666666666667,0.4666666666666667,0.0,0.8666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.0,0.5333333333333333,0.0,0.0],"baseline_reward_curve":[0.4666666666666667,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.0,0.6666666666666667,0.0,0.33333333333333337,0.1333333333333333,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19999999999999996,0.4666666666666667,0.33333333333333337,0.4,0.0],"gain_curve":[0.0,0.0,0.0,0.0,0.0,0.2666666666666667,0.2666666666666667,0.4,-0.6666666666666667,0.6666666666666667,0.06666666666666665,-0.1333333333333333,0.0,-0.5333333333333333,0.0,0.0,0.4,0.0,0.0,0.5333333333333333,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.4666666666666667,0.4666666666666667,0.0,0.8666666666666667,0.0,0.0,0.0,0.0,0.33333333333333337,-0.4666666666666667,0.19999999999999996,-0.4,0.0],"cost_curve":[0.052516,0.126278,0.05674,0.135005,0.031784,0.068302,0.03275,0.062988,0.133206,0.039392,0.064322,0.034402,0.051236,0.024868,0.06796,0.065206,0.064248,0.037474,0.085072,0.053326,0.10577,0.093374,0.047766,0.158834,0.098,0.045058,0.051978,0.070376,0.0583,0.065884,0.012708,0.113218,0.104044,0.2108894,0.18045,0.086334,0.061404,0.04632,0.077632,0.09831]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":0,"reward":75.5,"baseline_reward":36.5,"reference_reward":1138.5,"gain":39.0,"normalized_reward":-0.05781669817892327,"normalized_gain":0.03539019963702359,"cost_usd":1.27965,"latency_seconds":5.413177,"instance_count":120,"reward_curve":[-1.0,-0.5,-1.0,6.0,-1.0,1.0,3.0,-1.0,21.0,-6.0,1.0,-2.0,5.0,4.0,-3.0,-0.5,4.0,-2.0,5.0,20.0,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,1.0,-4.0,0.5,0.5,-2.0,-0.5,1.0,10.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.5,10.0,-0.5,-0.5,-0.5,-0.5,1.0,-6.0,-8.0,1.0,2.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,9.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,1.0,0.5,16.0,2.0,3.0,-0.5,1.0,2.0,1.0,-0.5,0.5,-0.5,-0.5,-1.0,0.5,0.5,1.0,-0.5,4.0,3.0,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,0.5,-18.0,-1.0,-1.0,-1.0,-0.5,-1.0,-2.0,-0.5,-1.0,17.0,1.0,0.5,-1.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,3.0,-0.5,1.0,3.0,-1.0,13.0,-2.5,1.0,-2.0,2.5,1.0,-1.0,-0.5,2.0,-1.0,4.0,8.0,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,-1.0,-2.0,0.5,0.5,-0.5,-0.5,1.0,10.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.5,10.0,-0.5,-0.5,-0.5,-0.5,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.0,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,3.0,0.5,-0.5,-1.0,-1.0,-1.0,-1.0,0.5,-11.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,17.0,2.0,0.5,-1.0],"gain_curve":[0.0,0.0,0.0,3.0,-0.5,0.0,0.0,0.0,8.0,-3.5,0.0,0.0,2.5,3.0,-2.0,0.0,2.0,-1.0,1.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,-2.0,0.0,0.0,-1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,11.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,-7.0,0.0,0.0,0.0,0.0,-3.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0],"cost_curve":[0.024432,0.008566,0.018732,0.02677,0.023392,0.015644,0.021606,0.01921,0.02319,0.023734,0.019954,0.02707,0.026294,0.020172,0.019906,0.00588,0.020804,0.022926,0.023344,0.021744,0.004758,0.00437,0.00587,0.004458,0.006172,0.005622,0.0,0.005586,0.0,0.00572,0.011814,0.0,0.00739,0.025384,0.0,0.0,0.009904,0.00752,0.010316,0.027216,0.01143,0.003036,0.003714,0.005644,0.005094,0.012634,0.02429,0.003706,0.004786,0.003382,0.006644,0.016946,0.019138,0.020376,0.01734,0.018216,0.006028,0.017632,0.017564,0.017538,0.004866,0.0,0.0,0.00654,0.00294,0.004268,0.003244,0.004462,0.012044,0.005822,0.003302,0.018786,0.0,0.006464,0.016474,0.013052,0.005992,0.0,0.005794,0.01746,0.005444,0.0,0.019136,0.010578,0.008352,0.00461,0.00433,0.016554,0.020064,0.003514,0.0,0.004756,0.003068,0.014652,0.0,0.0,0.01884,0.004842,0.013438,0.00919,0.0,0.01238,0.011142,0.015732,0.009242,0.003576,0.0,0.036118,0.003668,0.01032,0.003904,0.003574,0.009096,0.0181,0.00525,0.00342,0.038624,0.008218,0.0,0.0038]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":1,"reward":41.0,"baseline_reward":36.5,"reference_reward":1138.5,"gain":4.5,"normalized_reward":-0.09214847248482436,"normalized_gain":0.004083484573502722,"cost_usd":1.34893,"latency_seconds":5.33424,"instance_count":120,"reward_curve":[13.5,1.0,1.0,-1.0,1.0,1.0,-2.0,-1.0,2.0,2.5,-1.0,18.0,2.0,-0.5,1.0,-2.0,-1.0,-0.5,-1.0,-0.5,-1.0,-0.5,-0.5,1.0,-0.5,0.5,10.0,-1.0,0.0,0.5,-1.0,-0.5,0.5,-0.5,-0.5,0.5,-1.0,-0.5,-0.5,0.5,-1.0,-0.5,-2.0,-0.5,-0.5,10.0,-0.5,-1.0,-0.5,-1.0,-1.0,2.0,-1.0,-8.0,-4.0,1.0,-0.5,1.0,2.0,-1.0,5.0,-0.5,-1.0,2.0,-1.0,-0.5,0.5,0.5,-0.5,3.0,0.5,-0.5,-0.5,-0.5,-0.5,-0.5,1.0,1.0,-0.5,-0.5,0.5,0.5,2.0,0.5,-0.5,-1.0,6.0,-1.0,-0.5,1.0,0.5,-1.0,1.0,-0.5,-0.5,-1.0,0.5,-0.5,-1.0,-1.0,-23.0,-0.5,-0.5,0.5,-1.0,2.0,-0.5,-1.0,0.5,2.0,-1.0,-1.0,17.0,-0.5,4.0,0.5,3.0,1.0,-3.0,1.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,3.0,-0.5,1.0,3.0,-1.0,13.0,-2.5,1.0,-2.0,2.5,1.0,-1.0,-0.5,2.0,-1.0,4.0,8.0,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,-1.0,-2.0,0.5,0.5,-0.5,-0.5,1.0,10.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.5,10.0,-0.5,-0.5,-0.5,-0.5,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.0,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,3.0,0.5,-0.5,-1.0,-1.0,-1.0,-1.0,0.5,-11.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,17.0,2.0,0.5,-1.0],"gain_curve":[14.5,1.5,2.0,-4.0,1.5,0.0,-5.0,0.0,-11.0,5.0,-2.0,20.0,-0.5,-1.5,2.0,-1.5,-3.0,0.5,-5.0,-8.5,0.0,0.5,0.0,1.5,0.0,1.0,9.5,-0.5,-0.5,1.0,-1.0,-1.0,1.5,1.5,-1.0,0.0,-0.5,0.0,-1.5,-9.5,0.0,0.5,-1.5,0.0,0.0,12.5,-10.5,-0.5,0.0,-0.5,-0.5,1.0,5.0,0.0,-6.0,0.0,0.0,2.0,3.0,0.0,5.5,-1.0,-1.5,2.5,-0.5,-1.5,1.0,1.0,-1.5,3.5,1.0,-6.5,-1.0,0.0,0.5,-1.5,1.5,0.5,0.0,-2.5,1.5,0.0,-3.0,1.5,-3.5,-0.5,5.0,-3.0,-1.5,1.5,0.0,0.0,1.5,0.5,-1.0,-1.5,-0.5,0.0,0.0,-4.0,-23.5,0.0,0.5,1.5,0.0,3.0,-1.0,10.0,1.5,3.0,0.0,-0.5,15.0,2.5,4.5,1.5,-14.0,-1.0,-3.5,2.0],"cost_curve":[0.023248,0.020948,0.01918,0.017656,0.018614,0.020404,0.020192,0.018372,0.021726,0.020854,0.01861,0.021542,0.01957,0.00415,0.021236,0.024536,0.018714,0.003568,0.021052,0.003604,0.00508,0.005314,0.006034,0.016206,0.003394,0.0,0.02772,0.013468,0.01938,0.0,0.004342,0.003696,0.0,0.00541,0.003644,0.0,0.010972,0.007244,0.003118,0.0,0.011742,0.00479,0.01381,0.00429,0.0057,0.027404,0.003996,0.005088,0.004552,0.009596,0.017634,0.020814,0.018034,0.023222,0.01871,0.017962,0.004478,0.015598,0.026452,0.01732,0.022458,0.005324,0.017612,0.019498,0.019906,0.005768,0.0,0.0,0.004498,0.00845,0.0,0.00405,0.004176,0.003778,0.00595,0.003224,0.003718,0.014292,0.004754,0.006084,0.0,0.0,0.019446,0.0,0.004546,0.020392,0.02368,0.017462,0.00494,0.022332,0.0,0.018992,0.006074,0.006118,0.005962,0.004354,0.0,0.006004,0.009396,0.008744,0.03587,0.00619,0.007192,0.0,0.00486,0.017216,0.00627,0.017488,0.0,0.02393,0.004826,0.005086,0.041024,0.005744,0.01789,0.0,0.008854,0.016272,0.02761,0.006636]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":2,"reward":62.5,"baseline_reward":36.5,"reference_reward":1138.5,"gain":26.0,"normalized_reward":-0.07075330878694397,"normalized_gain":0.023593466424682397,"cost_usd":1.326166,"latency_seconds":5.202446,"instance_count":120,"reward_curve":[12.5,-1.0,-0.5,2.0,-1.0,2.0,-1.0,-2.0,-2.0,-0.5,20.0,-1.0,-1.0,2.0,3.0,1.0,3.0,-2.0,4.0,1.0,0.0,-0.5,1.0,-2.0,-0.5,0.5,-1.0,-2.0,0.5,0.5,-1.0,-0.5,-0.5,-1.0,-1.0,0.5,-0.5,-0.5,-0.5,-0.5,-0.5,10.0,-0.5,10.0,-0.5,0.5,0.0,-1.0,-0.5,-0.5,1.0,1.0,-6.0,-1.0,-1.0,-0.5,2.5,-1.0,1.0,-6.0,0.5,2.0,0.5,-0.5,0.5,1.0,-0.5,-0.5,-0.5,-0.5,1.0,4.5,-0.5,-0.5,-0.5,2.5,1.0,1.0,-1.0,-0.5,1.0,1.0,0.5,1.0,6.0,0.5,0.5,-0.5,-1.0,-1.0,-0.5,-0.5,-0.5,0.5,2.0,-1.0,1.0,-1.0,2.0,-1.0,0.5,-3.0,-11.0,1.0,-1.0,-0.5,-1.0,0.5,-1.0,18.0,2.0,-1.0,-0.5,-1.0,-0.5,0.5,-1.0,0.5,2.5,4.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,3.0,-0.5,1.0,3.0,-1.0,13.0,-2.5,1.0,-2.0,2.5,1.0,-1.0,-0.5,2.0,-1.0,4.0,8.0,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,-1.0,-2.0,0.5,0.5,-0.5,-0.5,1.0,10.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.5,10.0,-0.5,-0.5,-0.5,-0.5,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.0,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,3.0,0.5,-0.5,-1.0,-1.0,-1.0,-1.0,0.5,-11.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,17.0,2.0,0.5,-1.0],"gain_curve":[13.5,-0.5,0.5,-1.0,-0.5,1.0,-4.0,-1.0,-15.0,2.0,19.0,1.0,-3.5,1.0,4.0,1.5,1.0,-1.0,0.0,-7.0,1.0,0.5,1.5,-1.5,0.0,1.0,-1.5,-1.5,0.0,1.0,-1.0,-1.0,0.5,1.0,-1.5,0.0,0.0,0.0,-1.5,-10.5,0.5,11.0,0.0,10.5,0.0,3.0,-10.0,-0.5,0.0,0.0,1.5,0.0,0.0,7.0,-3.0,-1.5,3.0,0.0,2.0,-5.0,1.0,1.5,0.0,0.0,1.0,0.0,0.0,0.0,-1.5,0.0,1.5,-1.5,-1.0,0.0,0.5,1.5,1.5,0.5,-0.5,-2.5,2.0,0.5,-4.5,2.0,3.0,1.0,-0.5,-2.5,-2.0,-0.5,-1.0,0.5,0.0,1.5,1.5,-1.5,0.0,-0.5,3.0,-4.0,0.0,-2.5,-10.0,2.0,0.0,0.5,-1.5,11.5,0.0,19.0,3.0,-0.5,-2.5,2.0,0.0,1.5,-18.0,-1.5,2.0,5.0],"cost_curve":[0.029036,0.016548,0.003716,0.019192,0.017454,0.021148,0.018666,0.02162,0.024234,0.004112,0.02215,0.022788,0.025242,0.019412,0.021498,0.016986,0.024678,0.02106,0.020436,0.020094,0.020942,0.003194,0.008318,0.012276,0.002992,0.0,0.017916,0.02063,0.0,0.0,0.003442,0.00605,0.004368,0.004148,0.011112,0.0,0.006042,0.004848,0.00654,0.003914,0.005868,0.025228,0.003722,0.03005,0.005916,0.0,0.01766,0.003574,0.005538,0.00489,0.01815,0.015424,0.020078,0.017578,0.016454,0.00563,0.021368,0.018012,0.023088,0.020318,0.0,0.017296,0.0,0.005768,0.0,0.006364,0.004788,0.005512,0.004006,0.00541,0.007268,0.019714,0.006368,0.004844,0.005108,0.010448,0.008848,0.004546,0.016518,0.005168,0.005776,0.008062,0.0,0.006574,0.020734,0.0,0.0,0.006282,0.016944,0.020256,0.006446,0.00624,0.005788,0.0,0.010484,0.01158,0.01535,0.004572,0.023074,0.004884,0.0,0.02768,0.035452,0.006694,0.003912,0.007396,0.005768,0.0,0.010964,0.026106,0.01249,0.014304,0.006038,0.00921,0.005552,0.0,0.004012,0.0,0.01064,0.02358]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":3,"reward":43.5,"baseline_reward":36.5,"reference_reward":1138.5,"gain":7.0,"normalized_reward":-0.08966066275251268,"normalized_gain":0.006352087114337568,"cost_usd":1.39962,"latency_seconds":5.511306,"instance_count":120,"reward_curve":[2.0,3.0,2.5,2.5,-5.0,-5.0,3.0,-3.0,-4.0,5.0,-1.0,-0.5,-1.0,1.0,-1.0,-1.0,20.0,-0.5,1.0,22.0,-0.5,-0.5,-1.0,-0.5,0.0,-1.0,-1.0,-0.5,-0.5,-1.0,-1.0,0.5,0.5,-1.0,0.5,-0.5,-0.5,-1.0,-0.5,10.0,10.0,-0.5,-2.0,-0.5,0.5,-0.5,-0.5,-2.0,0.5,-0.5,-8.0,-1.0,1.0,2.0,-0.5,1.0,-6.0,-1.0,2.5,-1.0,1.0,3.0,0.5,12.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-2.0,4.0,0.5,-1.0,1.0,-0.5,-0.5,-0.5,0.5,-0.5,4.5,0.5,0.5,-1.0,-1.0,-0.5,1.0,1.0,0.5,-0.5,-0.5,-0.5,2.5,-0.5,1.0,-17.0,16.0,-1.0,0.5,0.5,-1.0,2.0,4.0,0.5,1.0,2.0,3.0,-1.0,-6.0,-0.5,0.5,-1.0,-0.5,-0.5,-9.0,-1.0,-1.0,-1.0,-1.0,-1.0],"baseline_reward_curve":[-1.0,-0.5,-1.0,3.0,-0.5,1.0,3.0,-1.0,13.0,-2.5,1.0,-2.0,2.5,1.0,-1.0,-0.5,2.0,-1.0,4.0,8.0,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,-1.0,-2.0,0.5,0.5,-0.5,-0.5,1.0,10.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.5,10.0,-0.5,-0.5,-0.5,-0.5,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.0,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,3.0,0.5,-0.5,-1.0,-1.0,-1.0,-1.0,0.5,-11.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,17.0,2.0,0.5,-1.0],"gain_curve":[3.0,3.5,3.5,-0.5,-4.5,-6.0,0.0,-2.0,-17.0,7.5,-2.0,1.5,-3.5,0.0,0.0,-0.5,18.0,0.5,-3.0,14.0,0.5,0.5,-0.5,0.0,0.5,-0.5,-1.5,0.0,-1.0,-0.5,-1.0,0.0,1.5,1.0,0.0,-1.0,0.0,-0.5,-1.5,0.0,11.0,0.5,-1.5,0.0,1.0,2.0,-10.5,-1.5,1.0,0.0,-7.5,-2.0,7.0,10.0,-2.5,0.0,-5.5,0.0,3.5,0.0,1.5,2.5,0.0,12.5,-0.5,-1.5,0.0,0.0,-1.5,1.0,-1.5,-2.0,0.0,-0.5,2.0,-1.5,0.0,-1.0,1.0,-2.5,5.5,0.0,-4.5,0.0,-4.0,0.0,0.0,-1.0,-0.5,0.0,-1.0,0.5,3.0,0.5,0.5,-17.5,15.0,-0.5,1.5,-2.5,-1.5,2.5,5.0,1.5,2.0,3.0,2.5,10.0,-5.0,0.5,1.5,-0.5,-2.5,2.5,-8.5,0.0,-18.0,-3.0,-1.5,0.0],"cost_curve":[0.027544,0.020752,0.021934,0.022768,0.02824,0.023446,0.02261,0.022902,0.026674,0.024126,0.016424,0.004664,0.02211,0.017108,0.02258,0.01771,0.02296,0.007402,0.017604,0.025018,0.004834,0.002902,0.011348,0.004754,0.020776,0.003676,0.003308,0.0049,0.004708,0.012298,0.015026,0.0,0.0,0.021968,0.0,0.005716,0.005908,0.003622,0.003728,0.024364,0.028858,0.006792,0.017598,0.003472,0.0,0.00347,0.003542,0.01489,0.0,0.006256,0.019646,0.01675,0.018614,0.023144,0.004962,0.016968,0.022746,0.017066,0.020776,0.016368,0.011904,0.011492,0.0,0.024904,0.018478,0.003114,0.002972,0.004126,0.005554,0.0,0.02072,0.021564,0.0,0.023676,0.011158,0.005878,0.003258,0.003344,0.0,0.003286,0.017952,0.0,0.0,0.021308,0.016258,0.00349,0.018278,0.004712,0.0,0.0058,0.006052,0.004058,0.019036,0.005368,0.014262,0.041194,0.02534,0.005814,0.0,0.0,0.003906,0.012704,0.011784,0.0,0.009968,0.013688,0.008566,0.010062,0.025724,0.00341,0.0,0.003678,0.005732,0.005734,0.024638,0.015776,0.003636,0.011898,0.00396,0.010076]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"exploitable_poker","run_index":4,"reward":45.0,"baseline_reward":36.5,"reference_reward":1138.5,"gain":8.5,"normalized_reward":-0.08816797691312568,"normalized_gain":0.007713248638838476,"cost_usd":1.276378,"latency_seconds":5.294263,"instance_count":120,"reward_curve":[1.0,-0.5,1.0,20.0,-1.0,4.0,1.0,4.0,1.0,-1.0,-2.5,-1.0,2.0,25.0,-5.0,-0.5,-1.0,-0.5,4.0,-1.0,-0.5,0.5,-6.0,-1.0,-0.5,0.0,-1.0,-1.0,-1.0,10.0,-2.0,-0.5,0.5,-0.5,-0.5,-2.0,-0.5,-0.5,-1.0,-0.5,0.5,-0.5,-0.5,0.5,0.5,-0.5,-1.0,-0.5,-0.5,-0.5,-1.0,-8.0,-8.0,-1.0,1.0,1.0,-1.0,2.5,1.0,-0.5,1.0,3.0,1.0,0.5,-0.5,-0.5,-0.5,1.0,1.0,0.5,-5.0,3.0,-0.5,-0.5,4.0,-0.5,-0.5,0.5,0.5,-0.5,0.5,-0.5,1.0,-1.0,-1.0,0.5,3.0,-0.5,6.0,1.0,0.5,2.5,-0.5,-0.5,-0.5,1.0,-1.0,-0.5,-1.0,-0.5,0.5,-1.0,2.0,-1.0,-1.0,-1.0,19.0,0.5,0.5,-0.5,-1.0,-1.0,2.0,1.0,0.5,-1.0,-1.0,-11.0,-1.0,-0.5],"baseline_reward_curve":[-1.0,-0.5,-1.0,3.0,-0.5,1.0,3.0,-1.0,13.0,-2.5,1.0,-2.0,2.5,1.0,-1.0,-0.5,2.0,-1.0,4.0,8.0,-1.0,-1.0,-0.5,-0.5,-0.5,-0.5,0.5,-0.5,0.5,-0.5,0.0,0.5,-1.0,-2.0,0.5,0.5,-0.5,-0.5,1.0,10.0,-1.0,-1.0,-0.5,-0.5,-0.5,-2.5,10.0,-0.5,-0.5,-0.5,-0.5,1.0,-6.0,-8.0,2.0,1.0,-0.5,-1.0,-1.0,-1.0,-0.5,0.5,0.5,-0.5,-0.5,1.0,-0.5,-0.5,1.0,-0.5,-0.5,6.0,0.5,-0.5,-1.0,1.0,-0.5,0.5,-0.5,2.0,-1.0,0.5,5.0,-1.0,3.0,-0.5,1.0,2.0,1.0,-0.5,0.5,-1.0,-0.5,-1.0,0.5,0.5,1.0,-0.5,-1.0,3.0,0.5,-0.5,-1.0,-1.0,-1.0,-1.0,0.5,-11.0,-1.0,-1.0,-1.0,-0.5,2.0,-3.0,-0.5,-1.0,17.0,2.0,0.5,-1.0],"gain_curve":[2.0,0.0,2.0,17.0,-0.5,3.0,-2.0,5.0,-12.0,1.5,-3.5,1.0,-0.5,24.0,-4.0,0.0,-3.0,0.5,0.0,-9.0,0.5,1.5,-5.5,-0.5,0.0,0.5,-1.5,-0.5,-1.5,10.5,-2.0,-1.0,1.5,1.5,-1.0,-2.5,0.0,0.0,-2.0,-10.5,1.5,0.5,0.0,1.0,1.0,2.0,-11.0,0.0,0.0,0.0,-0.5,-9.0,-2.0,7.0,-1.0,0.0,-0.5,3.5,2.0,0.5,1.5,2.5,0.5,1.0,0.0,-1.5,0.0,1.5,0.0,1.0,-4.5,-3.0,-1.0,0.0,5.0,-1.5,0.0,0.0,1.0,-2.5,1.5,-1.0,-4.0,0.0,-4.0,1.0,2.0,-2.5,5.0,1.5,0.0,3.5,0.0,0.5,-1.0,0.5,-2.0,0.0,0.0,-3.5,0.0,-0.5,3.0,0.0,0.0,0.0,18.5,11.5,1.5,0.5,0.0,-0.5,0.0,4.0,1.0,0.0,-18.0,-13.0,-1.5,0.5],"cost_curve":[0.019986,0.007068,0.022014,0.025936,0.018684,0.01948,0.019444,0.023092,0.015996,0.021338,0.028258,0.017966,0.022466,0.021906,0.024672,0.00526,0.0196,0.00454,0.021186,0.01615,0.00602,0.0,0.009542,0.01967,0.005152,0.01782,0.003666,0.003378,0.003644,0.031056,0.014408,0.00303,0.0,0.00509,0.006458,0.017646,0.006222,0.005068,0.010176,0.005516,0.0,0.005222,0.005966,0.0,0.0,0.005906,0.012066,0.006016,0.006148,0.004934,0.01754,0.020684,0.020962,0.016582,0.019092,0.017802,0.016348,0.020854,0.022674,0.006906,0.023012,0.011926,0.009974,0.0,0.004962,0.006154,0.002984,0.011548,0.006206,0.0,0.023492,0.010456,0.005738,0.005878,0.019684,0.003026,0.004896,0.0,0.0,0.00396,0.0,0.004586,0.00433,0.018914,0.019156,0.0,0.00963,0.003376,0.020972,0.006476,0.0,0.010876,0.00467,0.003022,0.005652,0.014186,0.008476,0.006378,0.008748,0.003826,0.0,0.00992,0.015394,0.003444,0.005698,0.003608,0.03978,0.0,0.0,0.006758,0.003812,0.016078,0.027278,0.008384,0.0,0.003876,0.008364,0.035506,0.00382,0.003182]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"sales_prediction","run_index":0,"reward":8.6193,"baseline_reward":3.1562,"reference_reward":12.0,"gain":5.463100000000001,"normalized_reward":0.4719883799022289,"normalized_gain":0.6177321965670867,"cost_usd":1.2769758,"latency_seconds":11.368048,"instance_count":12,"reward_curve":[0.2379,0.5581,0.763,0.7526,0.8032,0.7698,0.7255,0.7858,0.7822,0.7783,0.9022,0.7607],"baseline_reward_curve":[0.5952,0.0,0.2346,0.4659,0.0,0.0,0.1563,0.3341,0.2716,0.4693,0.3333,0.2959],"gain_curve":[-0.35729999999999995,0.5581,0.5284,0.28670000000000007,0.8032,0.7698,0.5692,0.45170000000000005,0.5105999999999999,0.309,0.5689,0.46480000000000005],"cost_curve":[0.084678,0.111976,0.078988,0.092336,0.078778,0.087612,0.2199718,0.087254,0.135188,0.079652,0.111822,0.10872]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"sales_prediction","run_index":1,"reward":8.379999999999999,"baseline_reward":3.1562,"reference_reward":12.0,"gain":5.223799999999999,"normalized_reward":0.4346135224202288,"normalized_gain":0.590673692304213,"cost_usd":1.540095,"latency_seconds":9.077323,"instance_count":12,"reward_curve":[0.5321,0.6588,0.572,0.5621,0.7789,0.5475,0.8037,0.8877,0.8438,0.7155,0.7173,0.7606],"baseline_reward_curve":[0.5952,0.0,0.2346,0.4659,0.0,0.0,0.1563,0.3341,0.2716,0.4693,0.3333,0.2959],"gain_curve":[-0.06309999999999993,0.6588,0.3373999999999999,0.09620000000000006,0.7789,0.5475,0.6474,0.5536000000000001,0.5722,0.24620000000000003,0.38400000000000006,0.46470000000000006],"cost_curve":[0.094078,0.13906,0.160224,0.136644,0.096448,0.129272,0.093532,0.150236,0.13898,0.088882,0.218369,0.09437]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"sales_prediction","run_index":2,"reward":6.9341,"baseline_reward":3.1562,"reference_reward":12.0,"gain":3.7779,"normalized_reward":0.20878691801896077,"normalized_gain":0.4271806237137882,"cost_usd":1.294486,"latency_seconds":8.573476,"instance_count":12,"reward_curve":[0.3484,0.5581,0.572,0.7066,0.6503,0.5909,0.583,0.5965,0.58,0.5781,0.5844,0.5858],"baseline_reward_curve":[0.5952,0.0,0.2346,0.4659,0.0,0.0,0.1563,0.3341,0.2716,0.4693,0.3333,0.2959],"gain_curve":[-0.24679999999999996,0.5581,0.3373999999999999,0.24070000000000003,0.6503,0.5909,0.42669999999999997,0.2624,0.30839999999999995,0.10879999999999995,0.25110000000000005,0.2899],"cost_curve":[0.095018,0.057732,0.082864,0.129498,0.072884,0.110328,0.121862,0.118404,0.13722,0.125534,0.12582,0.117322]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"sales_prediction","run_index":3,"reward":7.7303,"baseline_reward":3.1562,"reference_reward":12.0,"gain":4.5741,"normalized_reward":0.3331407062645446,"normalized_gain":0.5172097966937289,"cost_usd":1.6553214,"latency_seconds":8.445999,"instance_count":12,"reward_curve":[0.5499,0.5581,0.572,0.7617,0.6095,0.6167,0.6284,0.6548,0.6435,0.6371,0.819,0.6796],"baseline_reward_curve":[0.5952,0.0,0.2346,0.4659,0.0,0.0,0.1563,0.3341,0.2716,0.4693,0.3333,0.2959],"gain_curve":[-0.045299999999999896,0.5581,0.3373999999999999,0.29580000000000006,0.6095,0.6167,0.47209999999999996,0.32070000000000004,0.37189999999999995,0.1678,0.48569999999999997,0.3837],"cost_curve":[0.127562,0.170612,0.120956,0.096222,0.2877096,0.07184,0.073012,0.101646,0.2072386,0.2260372,0.09216,0.080326]},{"run_name":"icl-notepad-gemini-3.1-pro-preview","task":"sales_prediction","run_index":4,"reward":8.876100000000001,"baseline_reward":3.1562,"reference_reward":12.0,"gain":5.719900000000001,"normalized_reward":0.5120964593062304,"normalized_gain":0.6467694882290419,"cost_usd":1.8603184,"latency_seconds":8.093115,"instance_count":12,"reward_curve":[0.437,0.5581,0.8539,0.8204,0.7816,0.7459,0.7246,0.7205,0.7484,0.7313,0.9549,0.7995],"baseline_reward_curve":[0.5952,0.0,0.2346,0.4659,0.0,0.0,0.1563,0.3341,0.2716,0.4693,0.3333,0.2959],"gain_curve":[-0.15819999999999995,0.5581,0.6193,0.35450000000000004,0.7816,0.7459,0.5683,0.3864,0.47679999999999995,0.26199999999999996,0.6215999999999999,0.5036],"cost_curve":[0.135186,0.062706,0.102646,0.1379264,0.121564,0.097496,0.2638152,0.262176,0.1534802,0.1452074,0.2576612,0.120454]},{"run_name":"icl-notepad-gpt-5.4","task":"blind_spectrum_monitoring","run_index":0,"reward":35.235300000000045,"baseline_reward":19.761400000000002,"reference_reward":90.0,"gain":15.473900000000043,"normalized_reward":0.22030922991500507,"normalized_gain":0.2203047896740545,"cost_usd":1.0708825,"latency_seconds":7.454755,"instance_count":90,"reward_curve":[0.2203,0.297,0.3383,0.3448,0.3012,0.3012,0.3245,0.3093,0.3093,0.3093,0.3093,0.3093,0.3093,0.3876,0.3771,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065,0.4065],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2346,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.04879999999999998,0.1266,0.1184,0.07710000000000003,0.08840000000000003,0.09720000000000001,0.11430000000000001,0.08720000000000003,0.09670000000000001,0.06890000000000002,0.08080000000000001,0.09000000000000002,0.1393,0.1851,0.20909999999999998,0.18259999999999998,0.17949999999999997,0.19999999999999998,0.15909999999999996,0.20469999999999997,0.20459999999999998,0.19349999999999998,0.19819999999999996,0.18209999999999998,0.17319999999999997,0.19709999999999997,0.19599999999999998,0.17529999999999998,0.19929999999999998,0.20829999999999999,0.19799999999999998,0.19699999999999998,0.20379999999999998,0.18299999999999997,0.19259999999999997,0.20359999999999998,0.16509999999999997,0.20919999999999997,0.18619999999999998,0.18009999999999998,0.21389999999999998,0.16679999999999998,0.18489999999999998,0.17919999999999997,0.17909999999999998,0.18499999999999997,0.17559999999999998,0.17189999999999997,0.17779999999999999,0.18879999999999997,0.18499999999999997,0.19899999999999998,0.19379999999999997,0.18189999999999998,0.18129999999999996,0.20669999999999997,0.17039999999999997,0.21099999999999997,0.19089999999999996,0.16459999999999997,0.19509999999999997,0.18989999999999999,0.18549999999999997,0.20839999999999997,0.19099999999999998,0.17929999999999996,0.1513,0.19769999999999996,0.18529999999999996,0.15239999999999998,0.19259999999999997,0.15929999999999997,0.17619999999999997,0.18569999999999998,0.16879999999999998,0.16429999999999997,0.19359999999999997,0.15769999999999998,0.20679999999999998,0.19859999999999997,0.18889999999999998,0.18989999999999999,0.19639999999999996,0.18719999999999998,0.20609999999999998,0.20689999999999997,0.20479999999999998,0.16229999999999997,0.18429999999999996],"cost_curve":[0.0042725,0.00724,0.0083875,0.01052,0.0096375,0.00934,0.0118325,0.009505,0.01069,0.010765,0.0108875,0.0103275,0.009945,0.01212,0.013595,0.0137525,0.0131025,0.0132025,0.0119975,0.012185,0.01158,0.012015,0.011705,0.0120825,0.01293,0.012405,0.012445,0.011685,0.01227,0.0129675,0.0135725,0.0117025,0.01183,0.0118975,0.011845,0.011855,0.0115575,0.0118575,0.0120275,0.0123375,0.01243,0.0120525,0.01211,0.0123325,0.0128025,0.01217,0.012545,0.0128975,0.012515,0.01133,0.0114475,0.01167,0.0113375,0.0115225,0.01239,0.0132825,0.0122675,0.01189,0.0118175,0.01228,0.01313,0.012305,0.0122975,0.012255,0.01164,0.0121475,0.012095,0.01208,0.0121275,0.0124225,0.0123725,0.0134125,0.013375,0.0131925,0.0126975,0.012825,0.01222,0.01189,0.012365,0.012215,0.0123875,0.0123875,0.01223,0.0113625,0.0125925,0.01179,0.01287,0.011805,0.0123075,0.0131225]},{"run_name":"icl-notepad-gpt-5.4","task":"blind_spectrum_monitoring","run_index":1,"reward":34.24510000000004,"baseline_reward":19.761400000000002,"reference_reward":90.0,"gain":14.483700000000042,"normalized_reward":0.20621164879910078,"normalized_gain":0.20620712827419743,"cost_usd":1.19687,"latency_seconds":8.232888,"instance_count":90,"reward_curve":[0.2072,0.2245,0.2494,0.2677,0.2677,0.2677,0.2677,0.317,0.317,0.3282,0.3965,0.3965,0.3965,0.3965,0.3965,0.3965,0.3965,0.3965,0.3965,0.3965,0.3965,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.3768,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017,0.4017],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2346,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.0237,0.03770000000000001,0.0413,0.0436,0.054900000000000004,0.04039999999999999,0.122,0.09490000000000001,0.11559999999999998,0.15610000000000002,0.168,0.17720000000000002,0.14820000000000003,0.20450000000000002,0.19910000000000003,0.17260000000000003,0.1695,0.19000000000000003,0.1491,0.1947,0.17490000000000003,0.16380000000000003,0.1685,0.15240000000000004,0.14350000000000002,0.16740000000000002,0.16630000000000003,0.14560000000000003,0.16960000000000003,0.17860000000000004,0.16830000000000003,0.16730000000000003,0.17410000000000003,0.15330000000000002,0.16290000000000002,0.17390000000000003,0.13540000000000002,0.17950000000000002,0.15650000000000003,0.15040000000000003,0.18420000000000003,0.13710000000000003,0.1801,0.1744,0.1743,0.1802,0.1708,0.1671,0.17300000000000001,0.184,0.1802,0.1942,0.189,0.1771,0.1765,0.2019,0.1656,0.2062,0.1861,0.1598,0.1903,0.18510000000000001,0.1807,0.2036,0.1862,0.1745,0.14650000000000002,0.1929,0.1805,0.1476,0.1878,0.1545,0.1714,0.1809,0.164,0.1595,0.1888,0.1529,0.202,0.1938,0.1841,0.18510000000000001,0.1916,0.1824,0.2013,0.2021,0.2,0.1575,0.1795],"cost_curve":[0.0062975,0.00806,0.0098225,0.01054,0.01103,0.010455,0.01051,0.01096,0.0109,0.01078,0.0121875,0.012835,0.0114825,0.0120275,0.01295,0.012945,0.013565,0.012945,0.0136325,0.012845,0.0122375,0.01358,0.013285,0.01235,0.0131825,0.0129675,0.0138725,0.0125675,0.0135125,0.0132125,0.014465,0.0131675,0.0133775,0.0132825,0.01343,0.01312,0.013725,0.0132475,0.014615,0.01617,0.0146725,0.0147175,0.01399,0.0139025,0.013495,0.01352,0.01304,0.0134425,0.01341,0.0144075,0.012745,0.0132275,0.0122525,0.01437,0.01461,0.013095,0.013325,0.0141625,0.014385,0.01371,0.01396,0.01415,0.0146725,0.012835,0.013365,0.013825,0.01501,0.0148475,0.01445,0.01556,0.01431,0.013595,0.0139925,0.0149025,0.0139925,0.0138625,0.0137875,0.015175,0.01385,0.013365,0.01409,0.01442,0.01417,0.0141675,0.01392,0.015555,0.0140575,0.0135625,0.01437,0.0144325]},{"run_name":"icl-notepad-gpt-5.4","task":"blind_spectrum_monitoring","run_index":2,"reward":29.613400000000013,"baseline_reward":19.761400000000002,"reference_reward":90.0,"gain":9.852000000000011,"normalized_reward":0.1402696507638208,"normalized_gain":0.14026475470752567,"cost_usd":0.909715,"latency_seconds":6.677736,"instance_count":90,"reward_curve":[0.2482,0.2811,0.2723,0.2716,0.2876,0.2669,0.2368,0.2493,0.232,0.2599,0.3552,0.3519,0.3539,0.3244,0.2811,0.2619,0.2332,0.2341,0.2657,0.2826,0.2796,0.2949,0.2456,0.2087,0.3181,0.2862,0.2874,0.3061,0.2597,0.2529,0.2874,0.3168,0.2561,0.2666,0.2622,0.3517,0.3751,0.3896,0.3653,0.3662,0.3288,0.2889,0.324,0.3854,0.3632,0.3806,0.3725,0.3423,0.3637,0.2542,0.4262,0.3919,0.5049,0.3416,0.4741,0.5023,0.3683,0.4961,0.343,0.3736,0.2835,0.4114,0.3899,0.3584,0.2523,0.4082,0.291,0.4216,0.4023,0.4219,0.4058,0.3728,0.4001,0.2739,0.3242,0.3098,0.3392,0.3702,0.3651,0.3109,0.3118,0.3734,0.4036,0.4032,0.3629,0.3559,0.3547,0.2585,0.2949,0.2305],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2346,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.03290000000000001,0.06059999999999999,0.04520000000000002,0.06350000000000003,0.05410000000000004,0.009500000000000008,0.05429999999999999,0.00990000000000002,0.04730000000000001,0.11480000000000001,0.12339999999999998,0.1346,0.07610000000000003,0.08910000000000001,0.06450000000000003,0.009300000000000003,0.007099999999999995,0.0592,0.03520000000000001,0.07780000000000001,0.093,0.03260000000000002,0.0003999999999999837,0.0937,0.0529,0.07799999999999999,0.09559999999999999,0.028499999999999998,0.04570000000000002,0.0892,0.10830000000000004,0.0466,0.06390000000000001,0.038699999999999984,0.1378,0.1722,0.1482,0.168,0.14590000000000003,0.10239999999999999,0.0963,0.08430000000000001,0.16380000000000003,0.13590000000000002,0.1532,0.151,0.1114,0.12910000000000002,0.025499999999999995,0.20850000000000002,0.17040000000000002,0.2974,0.12890000000000001,0.24950000000000003,0.27709999999999996,0.1685,0.26,0.14750000000000002,0.15799999999999997,0.04159999999999997,0.19999999999999998,0.17330000000000004,0.1374,0.054200000000000026,0.1927,0.06379999999999997,0.1664,0.19349999999999998,0.2007,0.1517,0.1589,0.1529,0.04359999999999997,0.10339999999999999,0.07210000000000003,0.097,0.15729999999999997,0.11629999999999999,0.11120000000000002,0.10390000000000002,0.15580000000000002,0.18700000000000003,0.1931,0.1436,0.1555,0.15510000000000002,0.05680000000000002,0.050699999999999995,0.008300000000000002],"cost_curve":[0.0059075,0.0075425,0.008155,0.007665,0.008835,0.008825,0.0088075,0.009895,0.009475,0.0100825,0.00956,0.0084425,0.0085875,0.008405,0.0092075,0.0102875,0.010145,0.0094675,0.0090825,0.0093675,0.00796,0.0081375,0.0075825,0.0076375,0.007265,0.0100725,0.0084475,0.0089325,0.009185,0.0101625,0.0102575,0.00915,0.0097425,0.0105,0.0096,0.0100375,0.0099625,0.01057,0.010735,0.01075,0.0099475,0.0098075,0.010135,0.00983,0.0098275,0.00988,0.0094425,0.01019,0.00961,0.0096725,0.01097,0.0102725,0.011965,0.0109525,0.0104425,0.01086,0.01144,0.010445,0.0109925,0.0105675,0.011015,0.0112825,0.012805,0.012285,0.01057,0.01083,0.011605,0.0115425,0.01116,0.01074,0.0118675,0.011505,0.011395,0.01088,0.0115625,0.0107925,0.0113025,0.01087,0.0114125,0.0109725,0.0105,0.0109875,0.0105975,0.0106625,0.011265,0.01097,0.0106625,0.01141,0.0122225,0.0123625]},{"run_name":"icl-notepad-gpt-5.4","task":"blind_spectrum_monitoring","run_index":3,"reward":24.642200000000006,"baseline_reward":19.761400000000002,"reference_reward":90.0,"gain":4.880800000000004,"normalized_reward":0.0694941556685033,"normalized_gain":0.06948885655465804,"cost_usd":0.91444,"latency_seconds":6.978367,"instance_count":90,"reward_curve":[0.192,0.2349,0.2127,0.2295,0.2172,0.2159,0.2274,0.3043,0.2583,0.3094,0.3055,0.3819,0.3625,0.3238,0.3007,0.3292,0.2847,0.2823,0.2753,0.2793,0.2577,0.2663,0.2712,0.2546,0.2684,0.2539,0.2294,0.2705,0.2586,0.2891,0.2502,0.3013,0.3068,0.3377,0.3306,0.281,0.2843,0.368,0.334,0.309,0.3051,0.225,0.2938,0.379,0.2906,0.3137,0.3385,0.2489,0.2934,0.2609,0.2477,0.2924,0.2966,0.2405,0.2631,0.2403,0.2612,0.3998,0.2985,0.2888,0.3113,0.3069,0.3947,0.3064,0.3354,0.3105,0.3271,0.2723,0.3109,0.3232,0.2078,0.1933,0.2096,0.1691,0.1137,0.1807,0.2677,0.2387,0.2455,0.1826,0.2371,0.1978,0.2085,0.1975,0.3235,0.2869,0.2245,0.2453,0.2551,0.2314],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2346,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.013300000000000006,0.0010000000000000009,0.0031000000000000194,-0.0068999999999999895,0.0031000000000000194,9.999999999998899e-05,0.10930000000000001,0.03619999999999998,0.0968,0.06509999999999999,0.1534,0.1432,0.07549999999999998,0.10870000000000002,0.1318,0.06080000000000002,0.05529999999999999,0.0688,0.031899999999999984,0.05589999999999998,0.06439999999999999,0.0582,0.04629999999999998,0.04400000000000004,0.020600000000000007,0.01999999999999999,0.060000000000000026,0.027400000000000008,0.08190000000000003,0.05199999999999999,0.09280000000000002,0.09730000000000003,0.135,0.1071,0.06710000000000002,0.0814,0.1266,0.13670000000000002,0.0887,0.07869999999999999,0.03240000000000001,0.05410000000000001,0.1574,0.06330000000000002,0.08629999999999999,0.11700000000000002,0.018000000000000016,0.05879999999999999,0.032200000000000034,0.03,0.07089999999999999,0.08909999999999998,0.02779999999999999,0.038500000000000006,0.015100000000000002,0.06139999999999998,0.16369999999999998,0.10299999999999998,0.07319999999999999,0.06940000000000002,0.0955,0.1781,0.0854,0.13729999999999998,0.095,0.09989999999999999,0.017100000000000004,0.1021,0.10199999999999998,-0.04629999999999998,-0.020600000000000007,-0.037599999999999995,-0.061200000000000004,-0.1071,-0.056999999999999995,0.025499999999999995,0.02579999999999999,-0.0032999999999999974,-0.017099999999999976,0.029200000000000004,-0.019799999999999984,-0.008099999999999996,-0.0126,0.10420000000000001,0.0865,0.024900000000000005,0.0436,0.010899999999999993,0.009199999999999986],"cost_curve":[0.0055675,0.007665,0.007975,0.00794,0.0081375,0.007135,0.0080575,0.0077775,0.0083275,0.00824,0.0076025,0.0085325,0.009245,0.00843,0.00931,0.00872,0.008915,0.0108375,0.0126525,0.00921,0.010105,0.0102675,0.0102625,0.01074,0.0105375,0.008665,0.009235,0.0101975,0.00974,0.0108825,0.0110025,0.01173,0.00949,0.01021,0.0102175,0.00965,0.00934,0.01008,0.011405,0.01134,0.011615,0.0121125,0.0123725,0.01114,0.0116775,0.01085,0.01048,0.0103575,0.01017,0.009915,0.0098975,0.01029,0.0099975,0.0110675,0.01033,0.00996,0.010035,0.010305,0.00968,0.0102175,0.01024,0.01079,0.011305,0.0120025,0.01108,0.01095,0.010985,0.0120575,0.0112875,0.0110275,0.0100625,0.0098775,0.01031,0.0098675,0.0103675,0.01005,0.01095,0.010225,0.010255,0.010455,0.0104175,0.0101325,0.0099875,0.0105825,0.01005,0.0108725,0.0118925,0.0123025,0.0118375,0.014375]},{"run_name":"icl-notepad-gpt-5.4","task":"blind_spectrum_monitoring","run_index":4,"reward":35.838199999999986,"baseline_reward":19.761400000000002,"reference_reward":90.0,"gain":16.076799999999984,"normalized_reward":0.2288927803641849,"normalized_gain":0.22888838900547542,"cost_usd":1.026015,"latency_seconds":7.439559,"instance_count":90,"reward_curve":[0.2273,0.2719,0.3534,0.3641,0.3179,0.3023,0.3365,0.3583,0.35,0.3725,0.3725,0.3788,0.4011,0.3961,0.3683,0.2403,0.3013,0.2891,0.3028,0.3033,0.3033,0.3333,0.3333,0.3333,0.3333,0.3333,0.413,0.413,0.413,0.413,0.413,0.413,0.413,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4057,0.4282,0.4282,0.4282,0.4375,0.4317,0.4539,0.4315,0.4315,0.4332,0.4349,0.4554,0.4304,0.4073,0.4608,0.3886,0.4733,0.4756,0.4745,0.4721,0.4755,0.3766,0.3885,0.3865,0.3911,0.351,0.2827,0.4361,0.4686,0.4755,0.5081,0.4646,0.4968,0.4609,0.4522,0.4385,0.4656,0.4797,0.486],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2346,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,0.02369999999999997,0.1417,0.1377,0.09380000000000002,0.08950000000000002,0.10920000000000002,0.1633,0.12789999999999999,0.1599,0.1321,0.15030000000000002,0.18180000000000002,0.14780000000000001,0.1763,0.04290000000000002,0.07740000000000002,0.062100000000000016,0.09630000000000002,0.055900000000000005,0.1015,0.1314,0.12029999999999999,0.12499999999999997,0.1089,0.09999999999999998,0.20359999999999998,0.20249999999999999,0.1818,0.20579999999999998,0.2148,0.2045,0.2035,0.203,0.1822,0.1918,0.2028,0.1643,0.2084,0.1854,0.17930000000000001,0.2131,0.166,0.1841,0.1784,0.17830000000000001,0.1842,0.1748,0.1711,0.17700000000000002,0.188,0.1842,0.22070000000000004,0.21550000000000002,0.20360000000000003,0.2123,0.23189999999999997,0.21780000000000002,0.236,0.21589999999999998,0.19129999999999997,0.2235,0.23880000000000004,0.2094,0.2092,0.2453,0.1614,0.21810000000000002,0.26680000000000004,0.25329999999999997,0.21800000000000003,0.26159999999999994,0.1294,0.1582,0.16570000000000001,0.1534,0.10879999999999998,0.0698,0.1873,0.26890000000000003,0.26759999999999995,0.2905,0.24800000000000003,0.2867,0.24159999999999998,0.2518,0.2389,0.2639,0.23550000000000001,0.2638],"cost_curve":[0.005695,0.00749,0.0097675,0.0130075,0.0136875,0.01231,0.0120425,0.0109625,0.01185,0.0129925,0.011885,0.0112275,0.01112,0.0105825,0.0106275,0.00927,0.008805,0.0109225,0.0108875,0.0125525,0.01154,0.00984,0.00935,0.0091925,0.0100725,0.0097675,0.0100875,0.009835,0.0103875,0.0107975,0.0101775,0.0102025,0.0107875,0.01042,0.010405,0.0107375,0.010325,0.0108775,0.010085,0.0102525,0.0096725,0.01069,0.0102275,0.0105725,0.010315,0.0104875,0.0104425,0.009925,0.011025,0.01023,0.009465,0.01046,0.01008,0.0106575,0.0115325,0.0111125,0.0120125,0.0112075,0.0109025,0.0111325,0.01118,0.01255,0.0124775,0.0123575,0.01267,0.0127375,0.0137575,0.0124125,0.01268,0.0126425,0.0119175,0.012785,0.0139,0.0131625,0.0137375,0.0137675,0.0134775,0.0133225,0.0134775,0.013175,0.0126025,0.012365,0.0124825,0.0129325,0.01312,0.013365,0.0130425,0.01446,0.0148775,0.013605]},{"run_name":"icl-notepad-gpt-5.4","task":"codebase_adaptation","run_index":0,"reward":9.825000000000001,"baseline_reward":8.249999999999998,"reference_reward":19.0,"gain":1.5750000000000028,"normalized_reward":0.03926701570680646,"normalized_gain":0.14651162790697697,"cost_usd":3.727678,"latency_seconds":10.111997,"instance_count":19,"reward_curve":[0.0,0.0,0.0,0.85,0.875,0.8,0.85,0.0,0.6,0.825,0.0,0.7,0.875,0.75,0.0,0.675,0.75,0.525,0.75],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.0,0.825,0.725,0.525,0.0,0.825,0.75,0.85,0.85,0.0,0.5,0.8,0.0,0.775],"gain_curve":[0.0,0.0,0.0,0.85,0.050000000000000044,0.8,0.025000000000000022,-0.725,0.07499999999999996,0.825,-0.825,-0.050000000000000044,0.025000000000000022,-0.09999999999999998,0.0,0.17500000000000004,-0.050000000000000044,0.525,-0.025000000000000022],"cost_curve":[0.080164,0.095602,0.260383,0.0887375,0.0697825,0.209314,0.145974,0.2262595,0.659109,0.1640735,0.102065,0.2020065,0.0719115,0.141768,0.266302,0.2005995,0.0927405,0.507452,0.143434]},{"run_name":"icl-notepad-gpt-5.4","task":"codebase_adaptation","run_index":1,"reward":10.200000000000001,"baseline_reward":8.249999999999998,"reference_reward":19.0,"gain":1.9500000000000028,"normalized_reward":0.07853403141361275,"normalized_gain":0.18139534883720954,"cost_usd":3.2902655,"latency_seconds":9.654896,"instance_count":19,"reward_curve":[0.725,0.825,0.7,0.0,0.775,0.0,0.65,0.0,0.0,0.65,0.0,0.7,0.875,0.75,0.0,0.95,0.9,0.925,0.775],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.0,0.825,0.725,0.525,0.0,0.825,0.75,0.85,0.85,0.0,0.5,0.8,0.0,0.775],"gain_curve":[0.725,0.825,0.7,0.0,-0.04999999999999993,0.0,-0.17499999999999993,-0.725,-0.525,0.65,-0.825,-0.050000000000000044,0.025000000000000022,-0.09999999999999998,0.0,0.44999999999999996,0.09999999999999998,0.925,0.0],"cost_curve":[0.1309835,0.140907,0.213188,0.104971,0.140062,0.175306,0.57542,0.03309,0.188803,0.3250245,0.073964,0.28417,0.130808,0.204412,0.169246,0.0482565,0.108505,0.0896935,0.1534555]},{"run_name":"icl-notepad-gpt-5.4","task":"codebase_adaptation","run_index":2,"reward":9.875000000000002,"baseline_reward":8.249999999999998,"reference_reward":19.0,"gain":1.6250000000000036,"normalized_reward":0.044502617801047376,"normalized_gain":0.15116279069767471,"cost_usd":3.326778,"latency_seconds":9.610626,"instance_count":19,"reward_curve":[0.85,0.625,0.0,0.0,0.7,0.0,0.75,0.0,0.775,0.0,0.6,0.75,0.85,0.525,0.875,0.875,0.0,0.775,0.925],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.0,0.825,0.725,0.525,0.0,0.825,0.75,0.85,0.85,0.0,0.5,0.8,0.0,0.775],"gain_curve":[0.85,0.625,0.0,0.0,-0.125,0.0,-0.07499999999999996,-0.725,0.25,0.0,-0.22499999999999998,0.0,0.0,-0.32499999999999996,0.875,0.375,-0.8,0.775,0.15000000000000002],"cost_curve":[0.052056,0.195121,0.062622,0.095958,0.184475,0.102256,0.2278875,0.240324,0.1727635,0.184069,0.361724,0.198712,0.0834705,0.459255,0.0904405,0.090476,0.1644125,0.244957,0.1157985]},{"run_name":"icl-notepad-gpt-5.4","task":"codebase_adaptation","run_index":3,"reward":6.950000000000001,"baseline_reward":8.249999999999998,"reference_reward":19.0,"gain":-1.2999999999999972,"normalized_reward":-0.26178010471204166,"normalized_gain":-0.12093023255813926,"cost_usd":3.00468,"latency_seconds":8.38311,"instance_count":19,"reward_curve":[0.0,0.775,0.0,0.7,0.0,0.0,0.0,0.875,0.925,0.9,0.775,0.825,0.0,0.0,0.725,0.0,0.45,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.0,0.825,0.725,0.525,0.0,0.825,0.75,0.85,0.85,0.0,0.5,0.8,0.0,0.775],"gain_curve":[0.0,0.775,0.0,0.7,-0.825,0.0,-0.825,0.15000000000000002,0.4,0.9,-0.04999999999999993,0.07499999999999996,-0.85,-0.85,0.725,-0.5,-0.35000000000000003,0.0,-0.775],"cost_curve":[0.12721,0.217212,0.1383775,0.2812985,0.1473555,0.1494925,0.1985295,0.0876865,0.0841525,0.074441,0.0992415,0.07158,0.1600615,0.2296055,0.183152,0.1657975,0.3744815,0.1203095,0.0946955]},{"run_name":"icl-notepad-gpt-5.4","task":"codebase_adaptation","run_index":4,"reward":7.45,"baseline_reward":8.249999999999998,"reference_reward":19.0,"gain":-0.799999999999998,"normalized_reward":-0.2094240837696334,"normalized_gain":-0.0744186046511626,"cost_usd":4.121649,"latency_seconds":10.111213,"instance_count":19,"reward_curve":[0.0,0.8,0.0,0.55,0.0,0.775,0.0,0.825,0.0,0.875,0.775,0.8,0.0,0.0,0.0,0.875,0.625,0.55,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.0,0.825,0.0,0.825,0.725,0.525,0.0,0.825,0.75,0.85,0.85,0.0,0.5,0.8,0.0,0.775],"gain_curve":[0.0,0.8,0.0,0.55,-0.825,0.775,-0.825,0.09999999999999998,-0.525,0.875,-0.04999999999999993,0.050000000000000044,-0.85,-0.85,0.0,0.375,-0.17500000000000004,0.55,-0.775],"cost_curve":[0.119224,0.098982,0.20123,0.377675,0.1988035,0.210799,0.186106,0.2506525,0.2301175,0.0680925,0.1240665,0.0863905,0.350377,0.1888985,0.2102115,0.1234065,0.585488,0.371606,0.1395225]},{"run_name":"icl-notepad-gpt-5.4","task":"cohort_studies","run_index":0,"reward":0.6228,"baseline_reward":1.3756000000000002,"reference_reward":3.24404,"gain":-0.7528000000000001,"normalized_reward":-0.16523378112858625,"normalized_gain":-0.4029029564770612,"cost_usd":4.600753,"latency_seconds":10.60495,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1877,0.1828,0.2101,0.0422],"baseline_reward_curve":[0.0,0.1053,0.0,0.0,0.0,0.0,0.1048,0.0,0.0,0.0,0.0,0.0,0.2004,0.0,0.3549,0.0,0.0,0.1899,0.2598,0.1605],"gain_curve":[0.0,-0.1053,0.0,0.0,0.0,0.0,-0.1048,0.0,0.0,0.0,0.0,0.0,-0.2004,0.0,-0.3549,0.0,0.1877,-0.007100000000000023,-0.049699999999999966,-0.1183],"cost_curve":[0.2191815,0.200033,0.133535,0.160707,0.1642725,0.3499785,0.370662,0.3264055,0.2139715,0.227709,0.2387395,0.253366,0.162416,0.2071035,0.194961,0.2319655,0.2849015,0.26939,0.1725885,0.218866]},{"run_name":"icl-notepad-gpt-5.4","task":"cohort_studies","run_index":1,"reward":0.5042,"baseline_reward":1.3756000000000002,"reference_reward":3.24404,"gain":-0.8714000000000002,"normalized_reward":-0.21795567093716936,"normalized_gain":-0.46637836912076397,"cost_usd":4.4389445,"latency_seconds":10.187753,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.0,0.0,0.0305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0615,0.0039,0.2116,0.1967,0.0,0.0],"baseline_reward_curve":[0.0,0.1053,0.0,0.0,0.0,0.0,0.1048,0.0,0.0,0.0,0.0,0.0,0.2004,0.0,0.3549,0.0,0.0,0.1899,0.2598,0.1605],"gain_curve":[0.0,-0.1053,0.0,0.0,0.0,0.0305,-0.1048,0.0,0.0,0.0,0.0,0.0,-0.2004,0.0,-0.2934,0.0039,0.2116,0.0068000000000000005,-0.2598,-0.1605],"cost_curve":[0.2281095,0.227963,0.2190135,0.186544,0.3214155,0.0518325,0.2282795,0.2896655,0.206717,0.203632,0.2527695,0.2752355,0.255188,0.1828175,0.1858955,0.146804,0.1994655,0.181303,0.269846,0.326448]},{"run_name":"icl-notepad-gpt-5.4","task":"cohort_studies","run_index":2,"reward":0.37239999999999995,"baseline_reward":1.3756000000000002,"reference_reward":3.24404,"gain":-1.0032,"normalized_reward":-0.27654542706508883,"normalized_gain":-0.5369184988546596,"cost_usd":4.359435,"latency_seconds":9.458219,"instance_count":20,"reward_curve":[0.0,0.0,0.0,0.1621,0.0809,0.0,0.0,0.0194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11],"baseline_reward_curve":[0.0,0.1053,0.0,0.0,0.0,0.0,0.1048,0.0,0.0,0.0,0.0,0.0,0.2004,0.0,0.3549,0.0,0.0,0.1899,0.2598,0.1605],"gain_curve":[0.0,-0.1053,0.0,0.1621,0.0809,0.0,-0.1048,0.0194,0.0,0.0,0.0,0.0,-0.2004,0.0,-0.3549,0.0,0.0,-0.1899,-0.2598,-0.0505],"cost_curve":[0.227279,0.228195,0.2707,0.1946285,0.27094,0.1622895,0.2262005,0.1658895,0.2553735,0.225695,0.2180085,0.2470505,0.1795915,0.2225235,0.218762,0.213835,0.2510585,0.1836415,0.2085935,0.18918]},{"run_name":"icl-notepad-gpt-5.4","task":"cohort_studies","run_index":3,"reward":0.493,"baseline_reward":1.3756000000000002,"reference_reward":3.24404,"gain":-0.8826000000000002,"normalized_reward":-0.22293446660206084,"normalized_gain":-0.4723726745306246,"cost_usd":4.305354,"latency_seconds":11.608331,"instance_count":20,"reward_curve":[0.0,0.0172,0.0,0.0,0.0,0.0,0.0,0.1006,0.0,0.0,0.0,0.0,0.0,0.0766,0.0,0.0,0.0,0.0,0.0756,0.223],"baseline_reward_curve":[0.0,0.1053,0.0,0.0,0.0,0.0,0.1048,0.0,0.0,0.0,0.0,0.0,0.2004,0.0,0.3549,0.0,0.0,0.1899,0.2598,0.1605],"gain_curve":[0.0,-0.08810000000000001,0.0,0.0,0.0,0.0,-0.1048,0.1006,0.0,0.0,0.0,0.0,-0.2004,0.0766,-0.3549,0.0,0.0,-0.1899,-0.18419999999999997,0.0625],"cost_curve":[0.312408,0.2014905,0.261966,0.225322,0.3313635,0.40777,0.1959905,0.1424395,0.27058,0.141621,0.1418395,0.162129,0.208904,0.176098,0.0403965,0.148354,0.2244105,0.285607,0.2297465,0.196918]},{"run_name":"icl-notepad-gpt-5.4","task":"cohort_studies","run_index":4,"reward":0.3859,"baseline_reward":1.3756000000000002,"reference_reward":3.24404,"gain":-0.9897000000000001,"normalized_reward":-0.2705442001475857,"normalized_gain":-0.5296932200124169,"cost_usd":4.488287,"latency_seconds":10.858145,"instance_count":20,"reward_curve":[0.0,0.0,0.3191,0.0,0.0,0.0,0.0013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0655,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.1053,0.0,0.0,0.0,0.0,0.1048,0.0,0.0,0.0,0.0,0.0,0.2004,0.0,0.3549,0.0,0.0,0.1899,0.2598,0.1605],"gain_curve":[0.0,-0.1053,0.3191,0.0,0.0,0.0,-0.10350000000000001,0.0,0.0,0.0,0.0,0.0,-0.2004,0.0,-0.3549,0.0,0.0655,-0.1899,-0.2598,-0.1605],"cost_curve":[0.2620055,0.2620465,0.275095,0.198469,0.3085975,0.1564945,0.175827,0.1095945,0.2326885,0.331353,0.177354,0.272263,0.316704,0.1166185,0.166299,0.2345125,0.2386715,0.2674585,0.1411705,0.2450645]},{"run_name":"icl-notepad-gpt-5.4","task":"database_exploration","run_index":0,"reward":11.399999999999999,"baseline_reward":5.999999999999999,"reference_reward":40.0,"gain":5.3999999999999995,"normalized_reward":0.17021276595744678,"normalized_gain":0.1588235294117647,"cost_usd":1.2986395,"latency_seconds":3.42592,"instance_count":40,"reward_curve":[0.4,0.0,0.7333333333333334,0.0,0.6,0.8,0.6,0.4666666666666667,0.7333333333333334,0.0,0.7333333333333334,0.0,0.0,0.4,0.0,0.7333333333333334,0.0,0.0,0.7333333333333334,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.0,0.6,0.0,0.6666666666666667,0.0,0.0,0.0,0.6,0.0,0.6666666666666667,0.6],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.6,0.4,0.0,0.5333333333333333,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.5333333333333333,0.06666666666666665],"gain_curve":[0.4,0.0,0.7333333333333334,-0.33333333333333337,0.6,0.20000000000000007,0.19999999999999996,0.4666666666666667,0.20000000000000007,0.0,0.20000000000000007,-0.6666666666666667,-0.2666666666666667,-0.19999999999999996,0.0,0.7333333333333334,0.0,0.0,0.7333333333333334,0.4666666666666667,0.0,0.0,0.0,-0.6,0.0,0.0,0.0,0.8666666666666667,0.0,0.0,0.6,0.0,0.6666666666666667,0.0,0.0,-0.33333333333333337,0.06666666666666665,0.0,0.13333333333333341,0.5333333333333333],"cost_curve":[0.047653,0.0197565,0.028394,0.022879,0.0341405,0.021204,0.034902,0.048049,0.02139,0.032009,0.0246355,0.0436795,0.030164,0.044502,0.0469215,0.024848,0.0373495,0.0148775,0.022138,0.041392,0.0191745,0.029673,0.0204565,0.031385,0.024309,0.035614,0.059112,0.01306,0.0281225,0.0527195,0.0331745,0.0270205,0.029358,0.036843,0.047875,0.029875,0.044864,0.0297865,0.0229805,0.042352]},{"run_name":"icl-notepad-gpt-5.4","task":"database_exploration","run_index":1,"reward":10.866666666666667,"baseline_reward":5.999999999999999,"reference_reward":40.0,"gain":4.866666666666668,"normalized_reward":0.1547388781431335,"normalized_gain":0.14313725490196083,"cost_usd":1.6146935,"latency_seconds":3.870871,"instance_count":40,"reward_curve":[0.4,0.6,0.6666666666666667,0.0,0.0,0.0,0.0,0.7333333333333334,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.8,0.6,0.0,0.6666666666666667,0.0,0.0,0.6,0.0,0.0,0.33333333333333337,0.0,0.0,0.0,0.6666666666666667,0.7333333333333334,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.7333333333333334,0.9333333333333333,0.6666666666666667,0.8],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.6,0.4,0.0,0.5333333333333333,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.5333333333333333,0.06666666666666665],"gain_curve":[0.4,0.6,0.6666666666666667,-0.33333333333333337,0.0,-0.6,-0.4,0.7333333333333334,-0.5333333333333333,0.0,-0.5333333333333333,-0.6666666666666667,-0.2666666666666667,-0.19999999999999996,0.0,0.8,0.6,0.0,0.6666666666666667,0.0,0.0,0.6,0.0,-0.6,0.33333333333333337,0.0,0.0,0.0,0.6666666666666667,0.7333333333333334,0.0,0.0,0.0,0.0,0.0,0.19999999999999996,0.20000000000000007,0.9333333333333333,0.13333333333333341,0.7333333333333334],"cost_curve":[0.0577605,0.030447,0.031949,0.021999,0.0276105,0.0807935,0.0302235,0.025705,0.0477135,0.0820815,0.038873,0.0210015,0.028984,0.043696,0.052182,0.0168145,0.042251,0.0342545,0.039071,0.048205,0.075243,0.0403905,0.040399,0.030574,0.056828,0.0476465,0.0291505,0.0253615,0.0330705,0.039954,0.04628,0.015955,0.0472905,0.0370315,0.01517,0.0683995,0.058656,0.01795,0.0759225,0.0118055]},{"run_name":"icl-notepad-gpt-5.4","task":"database_exploration","run_index":2,"reward":15.866666666666665,"baseline_reward":5.999999999999999,"reference_reward":40.0,"gain":9.866666666666667,"normalized_reward":0.29980657640232106,"normalized_gain":0.2901960784313726,"cost_usd":1.3635355,"latency_seconds":3.59954,"instance_count":40,"reward_curve":[0.0,0.0,0.6,0.6666666666666667,0.6666666666666667,0.6,0.6,0.6666666666666667,0.0,0.0,0.0,0.6,0.0,0.0,0.6,0.4666666666666667,0.7333333333333334,0.8,0.7333333333333334,0.0,0.7333333333333334,0.0,0.5333333333333333,0.0,0.0,0.0,0.4666666666666667,0.7333333333333334,0.7333333333333334,0.6666666666666667,0.0,0.6666666666666667,0.6666666666666667,0.0,0.0,0.8,0.7333333333333334,0.0,0.5333333333333333,0.8666666666666667],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.6,0.4,0.0,0.5333333333333333,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.5333333333333333,0.06666666666666665],"gain_curve":[0.0,0.0,0.6,0.33333333333333337,0.6666666666666667,0.0,0.19999999999999996,0.6666666666666667,-0.5333333333333333,0.0,-0.5333333333333333,-0.06666666666666676,-0.2666666666666667,-0.6,0.6,0.4666666666666667,0.7333333333333334,0.8,0.7333333333333334,0.0,0.7333333333333334,0.0,0.5333333333333333,-0.6,0.0,0.0,0.4666666666666667,0.7333333333333334,0.7333333333333334,0.6666666666666667,0.0,0.6666666666666667,0.6666666666666667,0.0,0.0,0.4666666666666667,0.20000000000000007,0.0,0.0,0.8],"cost_curve":[0.0299675,0.1021025,0.038085,0.020173,0.0322565,0.024574,0.0373515,0.0286245,0.0346825,0.083133,0.020837,0.0298845,0.031292,0.031181,0.0373275,0.0423945,0.025891,0.015297,0.023029,0.032737,0.021239,0.027364,0.0382805,0.040501,0.0262905,0.0062375,0.0606435,0.0216725,0.022697,0.0294435,0.0313575,0.032962,0.034795,0.018785,0.045075,0.0288275,0.036963,0.0359055,0.061816,0.0218595]},{"run_name":"icl-notepad-gpt-5.4","task":"database_exploration","run_index":3,"reward":14.200000000000001,"baseline_reward":5.999999999999999,"reference_reward":40.0,"gain":8.200000000000003,"normalized_reward":0.2514506769825919,"normalized_gain":0.24117647058823538,"cost_usd":1.3599125,"latency_seconds":3.327074,"instance_count":40,"reward_curve":[0.0,0.4666666666666667,0.4666666666666667,0.0,0.8,0.6666666666666667,0.6666666666666667,0.8,0.0,0.7333333333333334,0.2666666666666667,0.6666666666666667,0.0,0.0,0.0,0.5333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,0.0,0.7333333333333334,0.7333333333333334,0.0,0.8,0.6,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.5333333333333333,0.6666666666666667,0.8,0.6],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.6,0.4,0.0,0.5333333333333333,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.5333333333333333,0.06666666666666665],"gain_curve":[0.0,0.4666666666666667,0.4666666666666667,-0.33333333333333337,0.8,0.06666666666666676,0.2666666666666667,0.8,-0.5333333333333333,0.7333333333333334,-0.2666666666666666,0.0,-0.2666666666666667,-0.6,0.0,0.5333333333333333,0.8666666666666667,0.0,0.0,0.0,0.0,0.5333333333333333,0.6,-0.6,0.7333333333333334,0.7333333333333334,0.0,0.8,0.6,0.6666666666666667,0.0,0.0,0.0,0.0,0.0,-0.33333333333333337,0.0,0.6666666666666667,0.2666666666666667,0.5333333333333333],"cost_curve":[0.041572,0.0384725,0.029558,0.015897,0.016337,0.032352,0.029652,0.017167,0.0350355,0.0190675,0.0548165,0.027933,0.036025,0.0743495,0.0210345,0.0264645,0.0143,0.0188835,0.021585,0.033507,0.0283575,0.036756,0.0421295,0.0197425,0.046224,0.0424175,0.0439585,0.0275285,0.061136,0.02787,0.028285,0.0996485,0.0531525,0.0256315,0.0212695,0.0328535,0.033189,0.025982,0.02525,0.034522]},{"run_name":"icl-notepad-gpt-5.4","task":"database_exploration","run_index":4,"reward":9.533333333333335,"baseline_reward":5.999999999999999,"reference_reward":40.0,"gain":3.533333333333336,"normalized_reward":0.11605415860735016,"normalized_gain":0.10392156862745106,"cost_usd":1.613971,"latency_seconds":3.465074,"instance_count":40,"reward_curve":[0.0,0.0,0.6666666666666667,0.0,0.0,0.0,0.0,0.33333333333333337,0.0,0.7333333333333334,0.2666666666666667,0.0,0.0,0.6,0.4666666666666667,0.7333333333333334,0.0,0.7333333333333334,0.0,0.0,0.0,0.6,0.5333333333333333,0.0,0.6666666666666667,0.4666666666666667,0.0,0.0,0.2666666666666667,0.0,0.6666666666666667,0.0,0.6666666666666667,0.4,0.0,0.0,0.0,0.7333333333333334,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.6,0.4,0.0,0.5333333333333333,0.0,0.5333333333333333,0.6666666666666667,0.2666666666666667,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.5333333333333333,0.0,0.5333333333333333,0.06666666666666665],"gain_curve":[0.0,0.0,0.6666666666666667,-0.33333333333333337,0.0,-0.6,-0.4,0.33333333333333337,-0.5333333333333333,0.7333333333333334,-0.2666666666666666,-0.6666666666666667,-0.2666666666666667,0.0,0.4666666666666667,0.7333333333333334,0.0,0.7333333333333334,0.0,0.0,0.0,0.6,0.5333333333333333,-0.6,0.6666666666666667,0.4666666666666667,0.0,0.0,0.2666666666666667,0.0,0.6666666666666667,0.0,0.6666666666666667,0.4,0.0,-0.33333333333333337,-0.5333333333333333,0.7333333333333334,-0.5333333333333333,-0.06666666666666665],"cost_curve":[0.0368,0.037297,0.0292325,0.027453,0.0158825,0.084119,0.0842365,0.0477455,0.0563905,0.0204305,0.075695,0.0199065,0.0971655,0.0417875,0.0659115,0.0379415,0.0231945,0.024443,0.0338395,0.048346,0.027482,0.0324725,0.038715,0.0367065,0.0281555,0.056256,0.0246125,0.02597,0.065235,0.020627,0.0334,0.028104,0.0282765,0.067131,0.0433285,0.043387,0.031491,0.024749,0.0204645,0.02959]},{"run_name":"icl-notepad-gpt-5.4","task":"exploitable_poker","run_index":0,"reward":86.5,"baseline_reward":81.1,"reference_reward":1138.5,"gain":5.400000000000006,"normalized_reward":-0.04687033535675191,"normalized_gain":0.0051068658974844005,"cost_usd":1.5249935,"latency_seconds":4.310811,"instance_count":120,"reward_curve":[-2.0,2.0,-1.0,19.0,-1.0,1.0,4.0,-6.5,31.5,-4.5,6.0,-7.0,4.0,7.0,-4.0,-4.5,3.0,-2.0,6.0,8.0,2.0,-5.0,-1.0,2.0,1.0,-1.0,0.5,-1.0,0.5,-1.0,2.0,0.5,2.0,-1.0,0.5,0.5,3.0,0.0,1.0,10.0,-1.0,3.0,-1.0,1.0,2.0,-1.0,2.0,-1.0,1.0,2.0,3.0,2.0,-6.5,-7.0,2.0,8.0,4.0,-8.0,-7.0,-2.0,1.0,0.5,0.5,-1.0,1.0,1.0,1.0,1.0,1.0,-2.0,1.0,12.0,0.5,-3.0,-3.0,4.0,-2.0,0.5,-2.0,3.0,-3.0,0.5,4.0,1.0,3.0,-4.0,1.0,3.0,1.0,-4.0,0.5,-2.0,-7.0,-6.0,0.5,0.5,1.0,-1.0,-1.0,1.0,0.5,-1.0,-4.0,1.0,-1.0,-2.0,0.5,-11.0,2.0,-2.0,5.0,2.0,2.0,-3.0,-1.0,2.0,21.0,4.0,0.5,-3.0],"baseline_reward_curve":[-2.0,1.0,-1.0,22.0,-4.0,3.0,7.0,-7.0,12.5,-17.0,1.0,-8.0,2.0,19.0,-4.0,-2.0,4.0,-3.0,5.6,8.0,2.0,-5.0,-0.5,2.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,2.0,-3.0,0.5,0.5,-1.0,0.0,1.0,12.0,-1.0,-2.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,-0.5,2.0,3.0,2.0,-3.0,-7.0,2.0,4.5,-0.5,-7.0,-2.0,-2.0,1.0,0.5,0.5,-0.5,1.0,1.0,1.0,1.0,1.0,-3.0,2.0,12.5,0.5,-2.0,-2.0,1.0,-0.5,0.5,-1.0,4.0,-2.0,0.5,4.0,1.0,3.0,-10.0,1.0,8.0,1.0,-2.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,-1.0,-3.0,3.5,0.5,-1.0,-3.0,1.0,-1.0,2.0,0.5,-13.0,2.0,2.0,3.0,-0.5,2.0,3.0,-1.0,2.0,20.0,2.0,0.5,-3.0],"gain_curve":[0.0,1.0,0.0,-3.0,3.0,-2.0,-3.0,0.5,19.0,12.5,5.0,1.0,2.0,-12.0,0.0,-2.5,-1.0,1.0,0.40000000000000036,0.0,0.0,0.0,-0.5,0.0,0.0,-0.5,0.0,-0.5,0.0,-0.5,0.0,0.0,0.0,2.0,0.0,0.0,4.0,0.0,0.0,-2.0,0.0,5.0,0.0,0.0,0.0,4.0,-8.0,0.0,1.5,0.0,0.0,0.0,-3.5,0.0,0.0,3.5,4.5,-1.0,-5.0,0.0,0.0,0.0,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,-0.5,0.0,-1.0,-1.0,3.0,-1.5,0.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,6.0,0.0,-5.0,0.0,-2.0,0.0,0.0,-4.0,-4.0,0.0,0.0,0.0,0.0,2.0,-2.5,0.0,0.0,-1.0,0.0,0.0,-4.0,0.0,2.0,0.0,-4.0,2.0,2.5,0.0,-6.0,0.0,0.0,1.0,2.0,0.0,0.0],"cost_curve":[0.0164625,0.014986,0.0149535,0.018163,0.015633,0.0165185,0.0159105,0.01781,0.015498,0.019923,0.0160055,0.0174555,0.015958,0.01761,0.0154605,0.022203,0.016475,0.015371,0.0160785,0.0150285,0.016336,0.0250075,0.0076725,0.01206,0.0162735,0.0069275,0.0,0.0072825,0.0,0.0071275,0.018001,0.0,0.0202755,0.0076975,0.0,0.0,0.01668,0.016228,0.013615,0.020558,0.0129175,0.019567,0.0078,0.0085375,0.01325,0.00808,0.0125875,0.0083275,0.0130525,0.01311,0.0180855,0.0156435,0.015286,0.0153885,0.0152285,0.0156655,0.0159285,0.014876,0.0159785,0.0154985,0.01127,0.0,0.0,0.014151,0.013005,0.0068,0.0110775,0.0069925,0.0069675,0.0155285,0.0161755,0.0158785,0.0,0.017558,0.016053,0.00733,0.0155435,0.0,0.016958,0.015618,0.016043,0.0,0.0159955,0.0136725,0.00696,0.014681,0.002965,0.006635,0.0072475,0.0156105,0.0,0.0167605,0.0168155,0.0158055,0.0,0.0,0.00817,0.0087775,0.009905,0.0068625,0.0,0.0068775,0.0177335,0.0127775,0.0071075,0.0193805,0.0,0.0306395,0.0118485,0.020175,0.0324255,0.023945,0.013163,0.0199505,0.0099975,0.0177275,0.028158,0.0108225,0.0,0.0323975]},{"run_name":"icl-notepad-gpt-5.4","task":"exploitable_poker","run_index":1,"reward":92.0,"baseline_reward":81.1,"reference_reward":1138.5,"gain":10.900000000000006,"normalized_reward":-0.04139715394566623,"normalized_gain":0.01030830338566295,"cost_usd":1.5445485,"latency_seconds":4.287262,"instance_count":120,"reward_curve":[34.0,9.0,1.0,-2.0,4.0,8.0,-4.0,-3.0,4.0,4.8,-4.0,14.0,2.0,-4.5,23.0,-8.8,-7.0,-4.0,-16.0,2.0,-2.0,-1.0,0.0,1.0,-1.0,0.5,10.0,-1.0,2.0,0.5,2.0,-1.0,0.5,-0.5,-1.0,0.5,-1.0,2.0,-1.0,0.5,-1.0,-0.5,-1.0,1.0,2.0,10.0,-3.0,-5.0,-1.0,-1.0,-7.0,5.5,-2.0,-7.0,-7.0,2.0,2.0,3.0,4.0,-2.0,8.0,1.0,-2.0,5.0,-2.0,-2.0,0.5,0.5,-3.0,3.0,0.5,1.0,1.0,1.0,1.0,-4.0,1.0,4.0,-6.0,-1.0,0.5,0.5,4.0,0.5,-7.0,1.0,12.0,-5.0,-1.0,1.0,0.5,-1.0,1.0,1.0,1.0,-5.0,0.5,-1.0,5.0,2.0,-11.0,2.0,-1.0,0.5,-2.0,2.0,-4.0,1.0,0.5,4.0,2.0,2.0,21.0,-1.0,2.0,0.5,1.0,1.0,-3.0,2.0],"baseline_reward_curve":[-2.0,1.0,-1.0,22.0,-4.0,3.0,7.0,-7.0,12.5,-17.0,1.0,-8.0,2.0,19.0,-4.0,-2.0,4.0,-3.0,5.6,8.0,2.0,-5.0,-0.5,2.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,2.0,-3.0,0.5,0.5,-1.0,0.0,1.0,12.0,-1.0,-2.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,-0.5,2.0,3.0,2.0,-3.0,-7.0,2.0,4.5,-0.5,-7.0,-2.0,-2.0,1.0,0.5,0.5,-0.5,1.0,1.0,1.0,1.0,1.0,-3.0,2.0,12.5,0.5,-2.0,-2.0,1.0,-0.5,0.5,-1.0,4.0,-2.0,0.5,4.0,1.0,3.0,-10.0,1.0,8.0,1.0,-2.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,-1.0,-3.0,3.5,0.5,-1.0,-3.0,1.0,-1.0,2.0,0.5,-13.0,2.0,2.0,3.0,-0.5,2.0,3.0,-1.0,2.0,20.0,2.0,0.5,-3.0],"gain_curve":[36.0,8.0,2.0,-24.0,8.0,5.0,-11.0,4.0,-8.5,21.8,-5.0,22.0,0.0,-23.5,27.0,-6.800000000000001,-11.0,-1.0,-21.6,-6.0,-4.0,4.0,0.5,-1.0,-2.0,1.0,9.5,-0.5,1.5,1.0,0.0,-1.5,-1.5,2.5,-1.5,0.0,0.0,2.0,-2.0,-11.5,0.0,1.5,0.0,0.0,0.0,15.0,-13.0,-4.0,-0.5,-3.0,-10.0,3.5,1.0,0.0,-9.0,-2.5,2.5,10.0,6.0,0.0,7.0,0.5,-2.5,5.5,-3.0,-3.0,-0.5,-0.5,-4.0,6.0,-1.5,-11.5,0.5,3.0,3.0,-5.0,1.5,3.5,-5.0,-5.0,2.5,0.0,0.0,-0.5,-10.0,11.0,11.0,-13.0,-2.0,3.0,0.0,1.0,4.0,3.0,0.5,-5.5,-0.5,0.0,8.0,-1.5,-11.5,3.0,2.0,-0.5,-1.0,0.0,-4.5,14.0,-1.5,2.0,-1.0,2.5,19.0,-4.0,3.0,-1.5,-19.0,-1.0,-3.5,5.0],"cost_curve":[0.01754,0.015838,0.015241,0.0171525,0.0138335,0.015811,0.016908,0.01777,0.014926,0.017661,0.0158775,0.0163205,0.015656,0.0162735,0.0159185,0.0167185,0.0142585,0.017455,0.018985,0.015326,0.01221,0.0062125,0.016648,0.0116025,0.0069825,0.0,0.0184305,0.0071575,0.01277,0.0,0.0199405,0.0077075,0.0,0.003685,0.0084625,0.0,0.0174525,0.018075,0.01742,0.0,0.0081625,0.0038825,0.0082475,0.0183725,0.0192535,0.0268815,0.0245355,0.0399985,0.009515,0.00947,0.0179495,0.015436,0.018415,0.0164085,0.015566,0.015101,0.0158905,0.015611,0.015246,0.015391,0.018423,0.007265,0.0176225,0.0175105,0.0174085,0.0156305,0.0,0.0,0.0171975,0.00835,0.0,0.007665,0.01157,0.00747,0.01216,0.016313,0.00358,0.01238,0.016546,0.0162805,0.0,0.0,0.016025,0.0,0.016528,0.0082825,0.016678,0.016908,0.015728,0.0123425,0.0,0.016053,0.008175,0.007775,0.0159455,0.0183125,0.0,0.0073775,0.0281665,0.01793,0.021688,0.0163475,0.008065,0.0,0.0214525,0.0139175,0.023985,0.0123805,0.0,0.009205,0.015701,0.011856,0.0299895,0.01019,0.0129655,0.0,0.0097275,0.009825,0.018631,0.013463]},{"run_name":"icl-notepad-gpt-5.4","task":"exploitable_poker","run_index":2,"reward":112.0,"baseline_reward":81.1,"reference_reward":1138.5,"gain":30.900000000000006,"normalized_reward":-0.02149467608717285,"normalized_gain":0.029222621524494046,"cost_usd":1.6368395,"latency_seconds":4.636741,"instance_count":120,"reward_curve":[13.2,-4.0,-7.5,5.4,-1.0,4.0,-8.0,-4.8,-8.0,3.0,8.0,-2.0,-2.0,9.6,4.0,3.0,47.0,-2.0,4.0,8.0,0.0,-0.5,2.0,-1.0,-0.5,0.5,1.0,-2.0,0.5,0.5,2.0,1.0,1.0,-5.0,-1.0,0.5,-1.0,-1.0,-1.0,-3.0,-1.0,10.0,-1.0,10.0,-1.0,0.5,2.0,3.0,-1.0,-2.0,19.5,2.0,-4.0,-9.0,-2.0,2.0,13.6,-4.0,3.0,-8.5,0.5,11.5,0.5,-3.0,0.5,1.0,1.0,-4.0,-3.0,1.0,-3.0,4.0,1.0,-2.0,-4.0,3.0,1.0,1.0,-2.0,1.0,1.0,1.0,0.5,2.0,16.0,0.5,0.5,-4.0,-16.0,4.0,1.0,-6.0,-2.0,0.5,1.0,-4.0,1.0,-2.0,3.0,2.0,0.5,3.0,-13.0,3.0,-4.0,-1.0,5.0,0.5,-1.0,12.0,2.0,1.0,-1.0,-1.0,-1.0,0.5,2.0,0.5,1.0,2.0],"baseline_reward_curve":[-2.0,1.0,-1.0,22.0,-4.0,3.0,7.0,-7.0,12.5,-17.0,1.0,-8.0,2.0,19.0,-4.0,-2.0,4.0,-3.0,5.6,8.0,2.0,-5.0,-0.5,2.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,2.0,-3.0,0.5,0.5,-1.0,0.0,1.0,12.0,-1.0,-2.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,-0.5,2.0,3.0,2.0,-3.0,-7.0,2.0,4.5,-0.5,-7.0,-2.0,-2.0,1.0,0.5,0.5,-0.5,1.0,1.0,1.0,1.0,1.0,-3.0,2.0,12.5,0.5,-2.0,-2.0,1.0,-0.5,0.5,-1.0,4.0,-2.0,0.5,4.0,1.0,3.0,-10.0,1.0,8.0,1.0,-2.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,-1.0,-3.0,3.5,0.5,-1.0,-3.0,1.0,-1.0,2.0,0.5,-13.0,2.0,2.0,3.0,-0.5,2.0,3.0,-1.0,2.0,20.0,2.0,0.5,-3.0],"gain_curve":[15.2,-5.0,-6.5,-16.6,3.0,1.0,-15.0,2.2,-20.5,20.0,7.0,6.0,-4.0,-9.4,8.0,5.0,43.0,1.0,-1.5999999999999996,0.0,-2.0,4.5,2.5,-3.0,-1.5,1.0,0.5,-1.5,0.0,1.0,0.0,0.5,-1.0,-2.0,-1.5,0.0,0.0,-1.0,-2.0,-15.0,0.0,12.0,0.0,9.0,-3.0,5.5,-8.0,4.0,-0.5,-4.0,16.5,0.0,-1.0,-2.0,-4.0,-2.5,14.1,3.0,5.0,-6.5,-0.5,11.0,0.0,-2.5,-0.5,0.0,0.0,-5.0,-4.0,4.0,-5.0,-8.5,0.5,0.0,-2.0,2.0,1.5,0.5,-1.0,-3.0,3.0,0.5,-3.5,1.0,13.0,10.5,-0.5,-12.0,-17.0,6.0,0.5,-4.0,1.0,2.5,0.5,-4.5,0.0,-1.0,6.0,-1.5,0.0,4.0,-10.0,2.0,-3.0,-3.0,4.5,13.5,-3.0,10.0,-1.0,1.5,-3.0,-4.0,0.0,-1.5,-18.0,-1.5,0.5,5.0],"cost_curve":[0.0175985,0.015416,0.0166885,0.0162835,0.014141,0.0155185,0.0146935,0.016451,0.0152635,0.0155135,0.0153835,0.014896,0.015841,0.016071,0.015406,0.015316,0.015716,0.016316,0.0181425,0.015221,0.013406,0.00328,0.0149485,0.0066725,0.0040125,0.0,0.0109125,0.0138325,0.0,0.0,0.014925,0.0118075,0.0080075,0.024179,0.011475,0.0,0.0071825,0.0074175,0.008975,0.0191305,0.010485,0.02479,0.00784,0.017558,0.008225,0.0,0.0136475,0.021377,0.007825,0.022273,0.0161405,0.015121,0.015776,0.0150285,0.0140185,0.0159785,0.0158535,0.0144135,0.0154085,0.0176255,0.0,0.019766,0.0,0.019986,0.0,0.013346,0.021341,0.0165255,0.0193535,0.0236025,0.018572,0.0162245,0.0094375,0.0168175,0.0165945,0.0096375,0.0092325,0.004015,0.015865,0.013278,0.012333,0.0093425,0.0,0.021403,0.0176645,0.0,0.0,0.0241245,0.022185,0.0097675,0.009795,0.019587,0.027141,0.0,0.0087385,0.0310535,0.0128425,0.015177,0.0080305,0.0155645,0.0,0.0306585,0.0344215,0.018656,0.0450005,0.0137105,0.032948,0.0,0.0096795,0.0276875,0.0172775,0.019941,0.0083225,0.0120925,0.0090925,0.0,0.0158955,0.0,0.015371,0.012316]},{"run_name":"icl-notepad-gpt-5.4","task":"exploitable_poker","run_index":3,"reward":23.0,"baseline_reward":81.1,"reference_reward":1138.5,"gain":-58.099999999999994,"normalized_reward":-0.1100607025574684,"normalized_gain":-0.05494609419330432,"cost_usd":1.5082525,"latency_seconds":4.312497,"instance_count":120,"reward_curve":[3.0,14.0,2.0,2.0,-3.0,-10.5,4.0,-7.0,-8.0,5.4,-6.5,2.0,-6.5,2.0,-2.0,-2.0,10.0,-7.5,12.0,16.5,1.0,-0.5,-1.0,2.0,2.0,-2.0,2.0,-1.0,2.0,-1.0,2.0,0.5,0.5,1.0,0.5,-0.5,0.0,-5.0,-1.0,2.0,10.0,-1.0,-3.0,-1.0,0.5,2.0,1.0,-7.5,0.5,1.0,-16.0,-8.0,4.0,4.0,4.5,2.4,-8.5,-1.0,1.0,-2.0,1.0,3.0,0.5,16.0,1.0,5.2,-2.0,1.0,-3.0,0.5,-2.0,4.0,0.5,-3.0,1.0,-3.0,1.0,-2.0,0.5,-7.0,4.0,0.5,0.5,-22.0,-5.0,1.0,1.0,1.0,0.5,-4.0,1.0,-4.0,4.0,-2.0,4.0,-13.0,19.0,2.0,0.5,0.5,2.0,3.0,4.0,0.5,1.0,-1.0,1.0,-1.0,3.0,-1.0,0.5,5.0,-1.0,2.0,-2.0,-1.0,2.0,-1.0,-4.0,2.0],"baseline_reward_curve":[-2.0,1.0,-1.0,22.0,-4.0,3.0,7.0,-7.0,12.5,-17.0,1.0,-8.0,2.0,19.0,-4.0,-2.0,4.0,-3.0,5.6,8.0,2.0,-5.0,-0.5,2.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,2.0,-3.0,0.5,0.5,-1.0,0.0,1.0,12.0,-1.0,-2.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,-0.5,2.0,3.0,2.0,-3.0,-7.0,2.0,4.5,-0.5,-7.0,-2.0,-2.0,1.0,0.5,0.5,-0.5,1.0,1.0,1.0,1.0,1.0,-3.0,2.0,12.5,0.5,-2.0,-2.0,1.0,-0.5,0.5,-1.0,4.0,-2.0,0.5,4.0,1.0,3.0,-10.0,1.0,8.0,1.0,-2.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,-1.0,-3.0,3.5,0.5,-1.0,-3.0,1.0,-1.0,2.0,0.5,-13.0,2.0,2.0,3.0,-0.5,2.0,3.0,-1.0,2.0,20.0,2.0,0.5,-3.0],"gain_curve":[5.0,13.0,3.0,-20.0,1.0,-13.5,-3.0,0.0,-20.5,22.4,-7.5,10.0,-8.5,-17.0,2.0,0.0,6.0,-4.5,6.4,8.5,-1.0,4.5,-0.5,0.0,1.0,-1.5,1.5,-0.5,1.5,-0.5,0.0,0.0,-1.5,4.0,0.0,-1.0,1.0,-5.0,-2.0,-10.0,11.0,1.0,-2.0,-2.0,-1.5,7.0,-9.0,-6.5,1.0,-1.0,-19.0,-10.0,7.0,11.0,2.5,-2.1,-8.0,6.0,3.0,0.0,0.0,2.5,0.0,16.5,0.0,4.2,-3.0,0.0,-4.0,3.5,-4.0,-8.5,0.0,-1.0,3.0,-4.0,1.5,-2.5,1.5,-11.0,6.0,0.0,-3.5,-23.0,-8.0,11.0,0.0,-7.0,-0.5,-2.0,0.5,-2.0,7.0,0.0,3.5,-13.5,18.0,3.0,3.5,-3.0,1.5,4.0,7.0,-0.5,2.0,-3.0,0.5,12.0,1.0,-3.0,-2.5,5.5,-3.0,-1.0,-1.0,-3.0,-18.0,-3.0,-4.5,5.0],"cost_curve":[0.018,0.0169155,0.015596,0.0156485,0.0196975,0.01869,0.0179685,0.015693,0.016488,0.016043,0.015718,0.019715,0.0185875,0.0162455,0.015723,0.0158705,0.020095,0.017248,0.0162505,0.016985,0.0176205,0.0030225,0.011645,0.01142,0.0106875,0.0109975,0.0114175,0.0063275,0.017641,0.0070525,0.00732,0.0,0.0,0.007195,0.0,0.003515,0.0189005,0.0257275,0.007595,0.0129425,0.026783,0.00728,0.01674,0.007205,0.0,0.0114285,0.0137825,0.0205645,0.0,0.020561,0.0200005,0.016173,0.015701,0.0163155,0.0155635,0.0164235,0.015788,0.0222505,0.0153025,0.0157475,0.0086475,0.008535,0.0,0.0144805,0.00945,0.015141,0.0138785,0.014636,0.0147785,0.0,0.015928,0.014606,0.0,0.017228,0.0035125,0.018076,0.0080675,0.0158805,0.0,0.0203025,0.0175175,0.0,0.0,0.0156235,0.0159455,0.00865,0.007855,0.0034925,0.0,0.0175405,0.0144275,0.022353,0.0080525,0.0219125,0.0086,0.0242945,0.027632,0.0117775,0.0,0.0,0.012705,0.00703,0.012765,0.0,0.0120675,0.006395,0.0068525,0.007125,0.0238235,0.00728,0.0,0.027094,0.0074725,0.0157355,0.0146575,0.015636,0.016388,0.006835,0.0277685,0.013985]},{"run_name":"icl-notepad-gpt-5.4","task":"exploitable_poker","run_index":4,"reward":93.2,"baseline_reward":81.1,"reference_reward":1138.5,"gain":12.100000000000009,"normalized_reward":-0.04020300527415663,"normalized_gain":0.01144316247399282,"cost_usd":1.3768895,"latency_seconds":4.27148,"instance_count":120,"reward_curve":[7.0,-4.0,21.0,9.0,-7.0,4.0,4.0,2.0,2.0,-4.0,-8.0,-7.0,3.0,13.1,-13.4,-2.0,-5.0,2.0,4.0,-1.0,1.0,0.5,10.0,1.0,-1.0,2.0,2.0,-2.0,4.0,10.0,-2.0,-0.5,0.5,-0.5,-1.0,-2.0,1.0,0.0,-1.0,-1.0,0.5,1.0,1.0,0.5,0.5,2.0,1.0,-1.0,2.0,-1.0,-1.0,-7.0,-6.5,-2.0,3.0,2.0,-2.0,2.0,1.0,2.0,1.0,10.0,1.0,0.5,-0.5,-2.0,-1.0,1.0,4.0,0.5,-3.0,4.0,1.0,-2.0,4.0,2.0,-2.0,0.5,0.5,1.0,0.5,1.0,1.0,-2.0,-9.0,0.5,3.5,-2.0,19.0,1.0,0.5,-1.0,1.0,-2.0,1.0,1.0,-3.0,-1.0,-1.0,-1.0,0.5,4.0,2.0,-2.0,3.0,2.0,17.0,0.5,0.5,2.0,5.0,-1.0,4.0,3.0,0.5,2.0,1.0,-13.0,-4.0,-1.0],"baseline_reward_curve":[-2.0,1.0,-1.0,22.0,-4.0,3.0,7.0,-7.0,12.5,-17.0,1.0,-8.0,2.0,19.0,-4.0,-2.0,4.0,-3.0,5.6,8.0,2.0,-5.0,-0.5,2.0,1.0,-0.5,0.5,-0.5,0.5,-0.5,2.0,0.5,2.0,-3.0,0.5,0.5,-1.0,0.0,1.0,12.0,-1.0,-2.0,-1.0,1.0,2.0,-5.0,10.0,-1.0,-0.5,2.0,3.0,2.0,-3.0,-7.0,2.0,4.5,-0.5,-7.0,-2.0,-2.0,1.0,0.5,0.5,-0.5,1.0,1.0,1.0,1.0,1.0,-3.0,2.0,12.5,0.5,-2.0,-2.0,1.0,-0.5,0.5,-1.0,4.0,-2.0,0.5,4.0,1.0,3.0,-10.0,1.0,8.0,1.0,-2.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,-1.0,-3.0,3.5,0.5,-1.0,-3.0,1.0,-1.0,2.0,0.5,-13.0,2.0,2.0,3.0,-0.5,2.0,3.0,-1.0,2.0,20.0,2.0,0.5,-3.0],"gain_curve":[9.0,-5.0,22.0,-13.0,-3.0,1.0,-3.0,9.0,-10.5,13.0,-9.0,1.0,1.0,-5.9,-9.4,0.0,-9.0,5.0,-1.5999999999999996,-9.0,-1.0,5.5,10.5,-1.0,-2.0,2.5,1.5,-1.5,3.5,10.5,-4.0,-1.0,-1.5,2.5,-1.5,-2.5,2.0,0.0,-2.0,-13.0,1.5,3.0,2.0,-0.5,-1.5,7.0,-9.0,0.0,2.5,-3.0,-4.0,-9.0,-3.5,5.0,1.0,-2.5,-1.5,9.0,3.0,4.0,0.0,9.5,0.5,1.0,-1.5,-3.0,-2.0,0.0,3.0,3.5,-5.0,-8.5,0.5,0.0,6.0,1.0,-1.5,0.0,1.5,-3.0,2.5,0.5,-3.0,-3.0,-12.0,10.5,2.5,-10.0,18.0,3.0,0.0,1.0,4.0,0.0,0.5,0.5,-4.0,0.0,2.0,-4.5,0.0,5.0,5.0,-3.0,4.0,0.0,16.5,13.5,-1.5,0.0,2.0,-0.5,2.0,0.0,1.5,0.0,-19.0,-15.0,-4.5,2.0],"cost_curve":[0.016445,0.014986,0.0158385,0.0163655,0.014916,0.0157885,0.014246,0.015746,0.0154785,0.015551,0.016103,0.016313,0.0169285,0.016133,0.016148,0.016088,0.0164755,0.016723,0.015553,0.014551,0.0133875,0.0,0.015806,0.00754,0.0097075,0.0118975,0.019483,0.0127025,0.01804,0.0247825,0.01345,0.0033125,0.0,0.003995,0.00851,0.01384,0.0077075,0.023155,0.011616,0.0082825,0.0,0.0111285,0.008185,0.0,0.0,0.0214555,0.003975,0.00838,0.0127985,0.0086675,0.0140385,0.0139985,0.014481,0.0136885,0.0147035,0.0143385,0.0136035,0.015936,0.0132535,0.0144635,0.012345,0.0156725,0.00663,0.0,0.0029725,0.016415,0.0140735,0.006935,0.014876,0.0,0.015751,0.0070575,0.0108425,0.016515,0.0138335,0.014351,0.0132735,0.0,0.0,0.0116125,0.0,0.007755,0.0031825,0.0175625,0.015251,0.0,0.007565,0.0138035,0.0146435,0.00696,0.0,0.015421,0.0088,0.0162585,0.0136825,0.007285,0.0158385,0.0117675,0.006385,0.0065925,0.0,0.0113225,0.0113775,0.0141835,0.018903,0.0107825,0.026306,0.0,0.0,0.0125875,0.027148,0.0160155,0.007615,0.0075075,0.0,0.013105,0.00309,0.020664,0.0238155,0.00987]},{"run_name":"icl-notepad-gpt-5.4","task":"sales_prediction","run_index":0,"reward":7.7515,"baseline_reward":4.482699999999999,"reference_reward":12.0,"gain":3.2688000000000015,"normalized_reward":0.3364518093929124,"normalized_gain":0.43483697604193006,"cost_usd":2.5235525,"latency_seconds":13.774607,"instance_count":12,"reward_curve":[0.3414,0.61,0.6254,0.6318,0.6688,0.5728,0.8049,0.8366,0.7188,0.7447,0.5974,0.5989],"baseline_reward_curve":[0.4402,0.318,0.4004,0.6342,0.4014,0.2813,0.2636,0.3196,0.3603,0.4986,0.3119,0.2532],"gain_curve":[-0.0988,0.292,0.22499999999999998,-0.0023999999999999577,0.26739999999999997,0.2915,0.5412999999999999,0.517,0.3585,0.24610000000000004,0.28550000000000003,0.3457],"cost_curve":[0.170838,0.14766,0.0671055,0.2414325,0.1818395,0.381905,0.21699,0.260785,0.165057,0.405345,0.0751545,0.2094405]},{"run_name":"icl-notepad-gpt-5.4","task":"sales_prediction","run_index":1,"reward":9.556000000000001,"baseline_reward":4.482699999999999,"reference_reward":12.0,"gain":5.073300000000002,"normalized_reward":0.6182860355787404,"normalized_gain":0.6748832692589096,"cost_usd":1.9718115,"latency_seconds":16.334104,"instance_count":12,"reward_curve":[0.5495,0.6103,0.8053,0.8334,0.7939,0.8142,0.7908,0.8906,0.8476,0.8546,0.8703,0.8955],"baseline_reward_curve":[0.4402,0.318,0.4004,0.6342,0.4014,0.2813,0.2636,0.3196,0.3603,0.4986,0.3119,0.2532],"gain_curve":[0.10930000000000001,0.29229999999999995,0.40490000000000004,0.19920000000000004,0.39250000000000007,0.5329,0.5271999999999999,0.571,0.4873,0.35600000000000004,0.5584,0.6423],"cost_curve":[0.3032805,0.0991545,0.1370445,0.196822,0.1517725,0.1775075,0.1119135,0.1453455,0.284213,0.115559,0.164839,0.08436]},{"run_name":"icl-notepad-gpt-5.4","task":"sales_prediction","run_index":2,"reward":8.800500000000001,"baseline_reward":4.482699999999999,"reference_reward":12.0,"gain":4.317800000000003,"normalized_reward":0.5002889406031833,"normalized_gain":0.5743817594082985,"cost_usd":2.549245,"latency_seconds":9.93876,"instance_count":12,"reward_curve":[0.5298,0.6095,0.7103,0.6825,0.8525,0.7478,0.6957,0.8189,0.7781,0.8316,0.7729,0.7709],"baseline_reward_curve":[0.4402,0.318,0.4004,0.6342,0.4014,0.2813,0.2636,0.3196,0.3603,0.4986,0.3119,0.2532],"gain_curve":[0.08960000000000007,0.29150000000000004,0.30990000000000006,0.04830000000000001,0.45110000000000006,0.4665,0.4321,0.49929999999999997,0.4178,0.333,0.461,0.5177],"cost_curve":[0.2616425,0.2046975,0.2819185,0.3102545,0.089758,0.265113,0.148532,0.158022,0.294385,0.1695385,0.222252,0.1431315]},{"run_name":"icl-notepad-gpt-5.4","task":"sales_prediction","run_index":3,"reward":8.075899999999999,"baseline_reward":4.482699999999999,"reference_reward":12.0,"gain":3.5932000000000004,"normalized_reward":0.3871179346213315,"normalized_gain":0.47799076796190115,"cost_usd":2.382286,"latency_seconds":13.698497,"instance_count":12,"reward_curve":[0.4915,0.6194,0.7659,0.6658,0.6741,0.641,0.6483,0.6365,0.6701,0.6696,0.782,0.8117],"baseline_reward_curve":[0.4402,0.318,0.4004,0.6342,0.4014,0.2813,0.2636,0.3196,0.3603,0.4986,0.3119,0.2532],"gain_curve":[0.05130000000000001,0.30139999999999995,0.36550000000000005,0.03159999999999996,0.27270000000000005,0.3597,0.3847,0.31689999999999996,0.3098,0.17099999999999999,0.4701,0.5585],"cost_curve":[0.191555,0.16027,0.1462495,0.212338,0.126209,0.242676,0.0890055,0.255666,0.277626,0.233552,0.285995,0.161144]},{"run_name":"icl-notepad-gpt-5.4","task":"sales_prediction","run_index":4,"reward":8.2436,"baseline_reward":4.482699999999999,"reference_reward":12.0,"gain":3.760900000000002,"normalized_reward":0.41331000983959904,"normalized_gain":0.5002993095925401,"cost_usd":2.334642,"latency_seconds":13.407292,"instance_count":12,"reward_curve":[0.6196,0.592,0.7202,0.7456,0.6888,0.7264,0.6092,0.852,0.65,0.6519,0.693,0.6949],"baseline_reward_curve":[0.4402,0.318,0.4004,0.6342,0.4014,0.2813,0.2636,0.3196,0.3603,0.4986,0.3119,0.2532],"gain_curve":[0.17940000000000006,0.27399999999999997,0.3198,0.11140000000000005,0.2874,0.44510000000000005,0.34559999999999996,0.5324,0.2897,0.15330000000000005,0.38109999999999994,0.4417],"cost_curve":[0.143836,0.150293,0.2925515,0.19011,0.1257305,0.288433,0.2382825,0.151579,0.2299795,0.2498955,0.2180365,0.055915]},{"run_name":"mem0-gpt-5.4","task":"blind_spectrum_monitoring","run_index":0,"reward":37.08659999999998,"baseline_reward":19.7601,"reference_reward":90.0,"gain":17.32649999999998,"normalized_reward":0.24666638192457152,"normalized_gain":0.24667603456155235,"cost_usd":1.1523225,"latency_seconds":5.339353,"instance_count":90,"reward_curve":[0.2203,0.2482,0.2741,0.2722,0.2971,0.3067,0.3008,0.2903,0.3587,0.3566,0.3786,0.3786,0.3566,0.4342,0.3459,0.3459,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.42,0.3566,0.3566,0.3566,0.3566,0.3566,0.3566,0.3708,0.444,0.444,0.444,0.444,0.444,0.444,0.444,0.444,0.444,0.444,0.444,0.444,0.4428,0.4428,0.4428,0.4428,0.4428,0.4428,0.4428,0.4428,0.4428,0.4428,0.4428,0.4428,0.4428,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4756,0.4604,0.4604,0.4604,0.4756],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.0,0.0,0.06240000000000001,0.04580000000000001,0.07299999999999998,0.09389999999999998,0.07350000000000001,0.0953,0.13660000000000003,0.14399999999999996,0.1382,0.15009999999999998,0.13729999999999998,0.18589999999999998,0.15389999999999998,0.1485,0.13269999999999998,0.12959999999999997,0.15009999999999998,0.10919999999999996,0.15479999999999997,0.15469999999999998,0.14359999999999998,0.14829999999999996,0.13219999999999998,0.12329999999999997,0.14719999999999997,0.14609999999999998,0.12539999999999998,0.2128,0.15839999999999999,0.14809999999999998,0.14709999999999998,0.15389999999999998,0.13309999999999997,0.14269999999999997,0.16790000000000002,0.2026,0.2467,0.2237,0.21760000000000002,0.2514,0.2043,0.22240000000000001,0.2167,0.21660000000000001,0.2225,0.2131,0.2107,0.21410000000000004,0.22510000000000002,0.22130000000000002,0.23530000000000004,0.23010000000000003,0.21820000000000003,0.21760000000000002,0.24300000000000002,0.20670000000000002,0.24730000000000002,0.2272,0.20090000000000002,0.23140000000000002,0.259,0.25460000000000005,0.2775,0.2601,0.2484,0.22040000000000004,0.26680000000000004,0.2544,0.22150000000000003,0.26170000000000004,0.22840000000000002,0.24530000000000002,0.2548,0.23790000000000003,0.23340000000000002,0.26270000000000004,0.22680000000000003,0.27590000000000003,0.26770000000000005,0.258,0.259,0.2655,0.25630000000000003,0.2752,0.2608,0.2587,0.21619999999999998,0.2534],"cost_curve":[0.0032775,0.0038325,0.0047525,0.006995,0.0079075,0.008675,0.0103475,0.01066,0.0114575,0.0121475,0.0129375,0.0129625,0.01272,0.013715,0.0138525,0.0142925,0.0137975,0.0139575,0.013965,0.0141975,0.013865,0.014375,0.0135475,0.014015,0.0139325,0.013675,0.01369,0.0134875,0.0135,0.013895,0.013295,0.0126875,0.0125225,0.0122425,0.0121675,0.011985,0.0121775,0.012295,0.0122925,0.0126275,0.01256,0.0123175,0.0126625,0.01254,0.01303,0.012765,0.012905,0.0129575,0.0130225,0.0122425,0.0123825,0.012305,0.012215,0.0123,0.0126075,0.01293,0.0125775,0.01265,0.01262,0.0128625,0.013245,0.01292,0.0134125,0.013465,0.0134725,0.0135825,0.01367,0.0136775,0.0138275,0.0137675,0.0137725,0.014235,0.0142125,0.0142175,0.0141025,0.0142875,0.014155,0.014075,0.014175,0.0141325,0.0142575,0.0142675,0.0143025,0.0139775,0.014775,0.0142425,0.01504,0.0145725,0.0147125,0.0145925]},{"run_name":"mem0-gpt-5.4","task":"blind_spectrum_monitoring","run_index":1,"reward":24.3845,"baseline_reward":19.7601,"reference_reward":90.0,"gain":4.624399999999998,"normalized_reward":0.06582525377639198,"normalized_gain":0.06583722357235698,"cost_usd":1.3047175,"latency_seconds":6.723355,"instance_count":90,"reward_curve":[0.2072,0.2245,0.2494,0.2677,0.2663,0.269,0.2779,0.3257,0.2966,0.3146,0.2809,0.2519,0.211,0.2893,0.2622,0.3098,0.2578,0.2798,0.2455,0.4087,0.2246,0.2583,0.2701,0.2495,0.2136,0.2609,0.2739,0.2009,0.273,0.3333,0.239,0.2494,0.2385,0.3114,0.3011,0.2739,0.2759,0.2341,0.3377,0.3223,0.2701,0.3226,0.2985,0.2969,0.3844,0.3001,0.2703,0.2753,0.3102,0.2929,0.2676,0.1918,0.1782,0.3078,0.2694,0.2876,0.2878,0.2785,0.2582,0.2775,0.2524,0.2478,0.3015,0.2617,0.2992,0.2678,0.2457,0.2143,0.2898,0.2862,0.2937,0.2675,0.258,0.2324,0.273,0.25,0.2904,0.2698,0.2673,0.2479,0.2515,0.2319,0.2845,0.2672,0.2812,0.2259,0.2657,0.2694,0.2763,0.2516],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.0131,-0.0237,0.03770000000000001,0.0413,0.04219999999999999,0.05620000000000003,0.05059999999999998,0.13069999999999998,0.07449999999999998,0.10199999999999998,0.04049999999999998,0.023400000000000004,-0.008300000000000002,0.04100000000000001,0.07019999999999998,0.11240000000000003,0.033899999999999986,0.052799999999999986,0.03900000000000001,0.1613,0.022799999999999987,0.05639999999999998,0.05710000000000001,0.04119999999999999,-0.010799999999999976,0.027600000000000013,0.06449999999999997,-0.009599999999999997,0.04180000000000003,0.1261,0.0408,0.04090000000000002,0.028999999999999998,0.10870000000000002,0.07759999999999997,0.05999999999999997,0.07299999999999998,-0.007300000000000001,0.1404,0.10199999999999998,0.04370000000000002,0.13,0.05879999999999999,0.0753,0.15710000000000002,0.07269999999999999,0.04879999999999998,0.044399999999999995,0.07689999999999997,0.06420000000000001,0.0499,-0.029700000000000004,-0.029299999999999993,0.09510000000000002,0.04479999999999998,0.06240000000000001,0.088,0.04240000000000002,0.06269999999999998,0.06190000000000001,0.01050000000000001,0.03639999999999999,0.0849,0.040699999999999986,0.10110000000000002,0.052299999999999985,0.01849999999999999,-0.04089999999999999,0.08099999999999999,0.065,0.039600000000000024,0.05360000000000001,0.010800000000000004,0.0020999999999999908,0.052200000000000024,0.012300000000000005,0.04819999999999999,0.05689999999999998,0.01849999999999999,0.04820000000000002,0.0436,0.014300000000000007,0.06789999999999999,0.057099999999999984,0.06190000000000001,0.025499999999999995,0.06609999999999999,0.06769999999999998,0.03209999999999999,0.02939999999999998],"cost_curve":[0.0049125,0.0066375,0.00773,0.00925,0.0098375,0.01037,0.0111175,0.0120825,0.013195,0.0138025,0.0149825,0.015395,0.0153075,0.01526,0.0158325,0.0156175,0.0154925,0.01529,0.0165975,0.016455,0.017155,0.01755,0.0176675,0.01726,0.0171275,0.0169775,0.0172925,0.01661,0.01657,0.0163525,0.01629,0.0159075,0.0157975,0.012635,0.0146225,0.01525,0.01392,0.0147075,0.012695,0.0115875,0.0145975,0.01123,0.0143225,0.01502,0.0128375,0.013705,0.0113925,0.01159,0.0118975,0.01209,0.0119225,0.01176,0.0115525,0.012675,0.0133,0.013065,0.013435,0.01368,0.0141275,0.0138775,0.014275,0.0143125,0.014845,0.0146925,0.0149025,0.0150875,0.015515,0.01568,0.0158925,0.0163675,0.01607,0.0159925,0.0161625,0.0166675,0.0163575,0.0163625,0.0163125,0.0166225,0.01636,0.0161575,0.016245,0.0162625,0.0162975,0.0163125,0.0163,0.01658,0.0162875,0.0160225,0.016285,0.0162725]},{"run_name":"mem0-gpt-5.4","task":"blind_spectrum_monitoring","run_index":2,"reward":41.496399999999944,"baseline_reward":19.7601,"reference_reward":90.0,"gain":21.736299999999943,"normalized_reward":0.30944916641751646,"normalized_gain":0.30945801460423406,"cost_usd":1.450388,"latency_seconds":5.772129,"instance_count":90,"reward_curve":[0.2482,0.2752,0.312,0.2938,0.3419,0.3404,0.4059,0.4256,0.3995,0.4164,0.478,0.488,0.4827,0.4857,0.485,0.5101,0.488,0.4672,0.488,0.4504,0.4661,0.4504,0.4661,0.4661,0.4622,0.4732,0.4753,0.4737,0.4753,0.4732,0.4753,0.4732,0.4753,0.4753,0.4741,0.4753,0.4753,0.4753,0.4753,0.4777,0.4777,0.4777,0.4753,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.3758,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777,0.4777],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.027900000000000008,0.026999999999999996,0.1003,0.06740000000000002,0.11779999999999999,0.1276,0.17859999999999998,0.23059999999999997,0.17740000000000003,0.20379999999999998,0.23759999999999998,0.25949999999999995,0.2634,0.23740000000000003,0.293,0.3127,0.2641,0.2402,0.2815,0.203,0.2643,0.24850000000000003,0.2531,0.25780000000000003,0.2378,0.2399,0.2659,0.2632,0.2441,0.266,0.2771,0.26470000000000005,0.26580000000000004,0.2726,0.25060000000000004,0.26139999999999997,0.2724,0.2339,0.278,0.2574,0.2513,0.2851,0.2356,0.2561,0.2504,0.2503,0.2562,0.24680000000000002,0.2444,0.24900000000000003,0.26,0.2562,0.2702,0.265,0.2531,0.2525,0.27790000000000004,0.2416,0.2822,0.2621,0.2358,0.2663,0.2611,0.25670000000000004,0.2796,0.2622,0.2505,0.22250000000000003,0.26890000000000003,0.2565,0.22360000000000002,0.26380000000000003,0.2305,0.2474,0.2569,0.24000000000000002,0.23550000000000001,0.26480000000000004,0.12700000000000003,0.278,0.26980000000000004,0.2601,0.2611,0.2676,0.2584,0.2773,0.2781,0.276,0.2335,0.2555],"cost_curve":[0.0038325,0.00496,0.0066875,0.007955,0.009875,0.010655,0.0116875,0.0130125,0.0130175,0.0131625,0.01325,0.0134925,0.01359,0.014475,0.01405,0.014295,0.0143375,0.01347,0.0137325,0.0137475,0.013635,0.015855,0.0135025,0.01363,0.013565,0.017305,0.014155,0.0149625,0.01738,0.0158325,0.0170025,0.01426,0.0182275,0.0189525,0.015775,0.018575,0.0185875,0.0190775,0.01908,0.0156,0.0186175,0.0176975,0.01883,0.0178,0.01585,0.0190325,0.018205,0.0179575,0.01582,0.014565,0.0190625,0.01896,0.019465,0.0174,0.0167975,0.0150855,0.0194025,0.0179675,0.0184575,0.014705,0.0194575,0.0194325,0.0197225,0.0159425,0.0146075,0.0193175,0.0192075,0.01715,0.0192825,0.01625,0.0197225,0.01672,0.0163325,0.01859,0.0165375,0.01858,0.01955,0.0190475,0.0194225,0.0149225,0.0188975,0.0195025,0.019405,0.0162125,0.0164375,0.0191125,0.0195875,0.018105,0.017985,0.015425]},{"run_name":"mem0-gpt-5.4","task":"blind_spectrum_monitoring","run_index":3,"reward":29.96869999999998,"baseline_reward":19.7601,"reference_reward":90.0,"gain":10.20859999999998,"normalized_reward":0.1453280940787878,"normalized_gain":0.14533904518656743,"cost_usd":1.4488005,"latency_seconds":5.415662,"instance_count":90,"reward_curve":[0.192,0.2465,0.2167,0.2361,0.2292,0.2361,0.2449,0.2894,0.2894,0.2894,0.2915,0.2927,0.2817,0.2927,0.2864,0.2927,0.2927,0.309,0.298,0.3034,0.2864,0.3058,0.2927,0.2927,0.2927,0.2927,0.3586,0.2864,0.3496,0.3381,0.309,0.2864,0.2864,0.2707,0.2927,0.2864,0.2927,0.3599,0.2924,0.3355,0.3586,0.3606,0.3394,0.3355,0.3034,0.3586,0.3562,0.3562,0.3562,0.3562,0.3562,0.3562,0.3562,0.3562,0.3562,0.3562,0.3718,0.3562,0.3562,0.3562,0.3725,0.3642,0.3562,0.3562,0.3877,0.3562,0.3831,0.3562,0.3562,0.3188,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877,0.4183,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877,0.3877],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[-0.028299999999999992,-0.001700000000000007,0.0050000000000000044,0.009700000000000014,0.005099999999999993,0.023300000000000015,0.017600000000000005,0.09439999999999998,0.0673,0.07679999999999998,0.05109999999999998,0.06420000000000001,0.06240000000000001,0.04440000000000002,0.09439999999999998,0.09530000000000002,0.06880000000000003,0.08199999999999999,0.0915,0.055999999999999994,0.08459999999999998,0.10390000000000002,0.07970000000000002,0.0844,0.06830000000000003,0.05940000000000001,0.14919999999999997,0.0759,0.11840000000000003,0.13090000000000002,0.11080000000000001,0.0779,0.0769,0.068,0.06920000000000001,0.07249999999999998,0.08980000000000002,0.1185,0.09509999999999999,0.11520000000000002,0.13219999999999998,0.16799999999999998,0.09969999999999998,0.11390000000000003,0.0761,0.13119999999999998,0.13470000000000001,0.12530000000000002,0.12290000000000001,0.12750000000000003,0.1385,0.13470000000000001,0.14870000000000003,0.14350000000000002,0.13160000000000002,0.131,0.17200000000000001,0.12010000000000001,0.1607,0.1406,0.1306,0.15280000000000002,0.13960000000000003,0.13520000000000001,0.1896,0.14070000000000002,0.15589999999999998,0.10100000000000003,0.1474,0.09759999999999996,0.1336,0.17379999999999998,0.14049999999999999,0.15739999999999998,0.1669,0.15,0.1455,0.2054,0.1389,0.188,0.1798,0.1701,0.1711,0.17759999999999998,0.1684,0.1873,0.1881,0.186,0.1435,0.16549999999999998],"cost_curve":[0.0043475,0.0054825,0.0070325,0.0079575,0.0087675,0.009355,0.0103325,0.01095,0.0115025,0.0117275,0.01172,0.012265,0.0133375,0.0127375,0.013705,0.01346,0.01358,0.01455,0.0159425,0.0152325,0.0153825,0.015425,0.01497,0.0150775,0.015105,0.0151275,0.016235,0.01592,0.0158725,0.0161225,0.0159975,0.0161625,0.0158475,0.0161475,0.015905,0.015875,0.015485,0.0167825,0.0165425,0.0163675,0.016405,0.01674,0.0166,0.0167475,0.016805,0.01662,0.0170125,0.016965,0.017105,0.0170175,0.01761,0.0174125,0.0175475,0.018255,0.0176325,0.0175875,0.0178325,0.0179775,0.01762,0.0178,0.0181075,0.01856,0.0182825,0.018245,0.0184125,0.018185,0.018725,0.01864,0.01838,0.0185725,0.0188175,0.01885,0.019295,0.0188525,0.0189025,0.0189625,0.019395,0.01878,0.01913,0.0188775,0.0193,0.019145,0.019035,0.0193825,0.0153405,0.01966,0.0193625,0.019375,0.01923,0.01937]},{"run_name":"mem0-gpt-5.4","task":"blind_spectrum_monitoring","run_index":4,"reward":36.03140000000001,"baseline_reward":19.7601,"reference_reward":90.0,"gain":16.27130000000001,"normalized_reward":0.231643389000413,"normalized_gain":0.23165323413045874,"cost_usd":1.5891435,"latency_seconds":5.459746,"instance_count":90,"reward_curve":[0.2273,0.2631,0.3218,0.317,0.3253,0.3859,0.3978,0.3986,0.3973,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4052,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4227,0.4012,0.4232,0.4012,0.4012,0.4356,0.4356,0.4012,0.4356,0.4012,0.4356,0.4356,0.4356,0.4356,0.4356,0.4012,0.4356,0.4012,0.4012,0.4012,0.4356,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4351,0.4389,0.4382,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012,0.4012],"baseline_reward_curve":[0.2203,0.2482,0.2117,0.2264,0.2241,0.2128,0.2273,0.195,0.2221,0.2126,0.2404,0.2285,0.2193,0.2483,0.192,0.1974,0.2239,0.227,0.2065,0.2474,0.2018,0.2019,0.213,0.2083,0.2244,0.2333,0.2094,0.2105,0.2312,0.2072,0.1982,0.2085,0.2095,0.2027,0.2235,0.2139,0.2029,0.2414,0.1973,0.2203,0.2264,0.1926,0.2397,0.2216,0.2273,0.2274,0.2215,0.2309,0.2333,0.2287,0.2177,0.2215,0.2075,0.2127,0.2246,0.2252,0.1998,0.2361,0.1955,0.2156,0.2419,0.2114,0.2166,0.221,0.1981,0.2155,0.2272,0.2552,0.2088,0.2212,0.2541,0.2139,0.2472,0.2303,0.2208,0.2377,0.2422,0.2129,0.2488,0.1997,0.2079,0.2176,0.2166,0.2101,0.2193,0.2004,0.1996,0.2017,0.2442,0.2222],"gain_curve":[0.007000000000000006,0.014899999999999997,0.11009999999999998,0.09060000000000001,0.10119999999999998,0.17310000000000003,0.17049999999999998,0.2036,0.1752,0.1886,0.1608,0.1727,0.1819,0.1529,0.2092,0.2038,0.1773,0.1742,0.1947,0.1538,0.2034,0.1993,0.1882,0.1929,0.1768,0.1679,0.1918,0.1907,0.17,0.194,0.203,0.1927,0.21320000000000003,0.1985,0.19970000000000002,0.1873,0.1983,0.19419999999999998,0.23829999999999998,0.1809,0.2092,0.2086,0.1959,0.214,0.20829999999999999,0.2082,0.21409999999999998,0.1703,0.20229999999999998,0.17250000000000001,0.1835,0.1797,0.2281,0.1885,0.1766,0.176,0.2014,0.1651,0.2057,0.1856,0.1593,0.1898,0.2185,0.2179,0.24009999999999998,0.1857,0.174,0.14600000000000002,0.1924,0.18,0.1471,0.1873,0.154,0.1709,0.1804,0.1635,0.159,0.1883,0.1524,0.2015,0.1933,0.1836,0.18460000000000001,0.1911,0.1819,0.2008,0.2016,0.1995,0.157,0.179],"cost_curve":[0.004385,0.0055075,0.0077,0.0102,0.0111175,0.0120175,0.0127125,0.012495,0.0130325,0.013865,0.0135925,0.0141,0.0145625,0.0137725,0.015315,0.0160475,0.016215,0.0166775,0.01712,0.01691,0.017075,0.01846,0.01577,0.0139575,0.0182825,0.01905,0.019205,0.0186025,0.019205,0.0184425,0.0192075,0.018725,0.01988,0.0191975,0.0196025,0.0194725,0.0192025,0.0199,0.0194025,0.019335,0.0196425,0.01976,0.019765,0.0197875,0.0197925,0.0199675,0.01979,0.0192,0.01978,0.018295,0.0186,0.0191875,0.019205,0.019205,0.01932,0.0191925,0.01962,0.019355,0.0191925,0.01545,0.019315,0.019335,0.0196,0.0194475,0.0157005,0.019335,0.019625,0.0151755,0.01935,0.0193375,0.0186025,0.01934,0.019595,0.0193475,0.01947,0.0193175,0.0193425,0.0196175,0.0196625,0.01853,0.019455,0.019225,0.019525,0.0195825,0.01959,0.0196675,0.01954,0.02013,0.0201675,0.0197875]},{"run_name":"mem0-gpt-5.4","task":"codebase_adaptation","run_index":0,"reward":10.749999999999998,"baseline_reward":8.125,"reference_reward":19.0,"gain":2.6249999999999982,"normalized_reward":0.13612565445026165,"normalized_gain":0.24137931034482743,"cost_usd":2.5361395,"latency_seconds":4.880903,"instance_count":19,"reward_curve":[0.85,0.9,0.8,0.0,0.9,0.675,0.825,0.6,0.825,0.875,0.0,0.825,0.95,0.825,0.0,0.0,0.0,0.0,0.9],"baseline_reward_curve":[0.825,0.0,0.8,0.0,0.575,0.525,0.625,0.75,0.35,0.75,0.0,0.7,0.875,0.775,0.0,0.575,0.0,0.0,0.0],"gain_curve":[0.025000000000000022,0.9,0.0,0.0,0.32500000000000007,0.15000000000000002,0.19999999999999996,-0.15000000000000002,0.475,0.125,0.0,0.125,0.07499999999999996,0.04999999999999993,0.0,-0.575,0.0,0.0,0.9],"cost_curve":[0.047614,0.0596745,0.196187,0.2386235,0.0425195,0.2830475,0.165064,0.370908,0.128444,0.0955955,0.0572615,0.134889,0.035348,0.097978,0.199581,0.1091135,0.083809,0.129353,0.061129]},{"run_name":"mem0-gpt-5.4","task":"codebase_adaptation","run_index":1,"reward":13.375,"baseline_reward":8.125,"reference_reward":19.0,"gain":5.25,"normalized_reward":0.4109947643979058,"normalized_gain":0.4827586206896552,"cost_usd":2.870252,"latency_seconds":4.785827,"instance_count":19,"reward_curve":[0.65,0.625,0.775,0.875,0.85,0.85,0.0,0.9,0.0,0.65,0.925,0.6,0.675,0.65,0.925,0.85,0.875,0.875,0.825],"baseline_reward_curve":[0.825,0.0,0.8,0.0,0.575,0.525,0.625,0.75,0.35,0.75,0.0,0.7,0.875,0.775,0.0,0.575,0.0,0.0,0.0],"gain_curve":[-0.17499999999999993,0.625,-0.025000000000000022,0.875,0.275,0.32499999999999996,-0.625,0.15000000000000002,-0.35,-0.09999999999999998,0.925,-0.09999999999999998,-0.19999999999999996,-0.125,0.925,0.275,0.875,0.875,0.825],"cost_curve":[0.306426,0.2800575,0.177008,0.0654505,0.0983585,0.0899795,0.137783,0.0533705,0.150241,0.2973345,0.045199,0.311771,0.190754,0.2953595,0.041729,0.0860045,0.0826995,0.070787,0.0899395]},{"run_name":"mem0-gpt-5.4","task":"codebase_adaptation","run_index":2,"reward":7.45,"baseline_reward":8.125,"reference_reward":19.0,"gain":-0.6749999999999998,"normalized_reward":-0.2094240837696334,"normalized_gain":-0.062068965517241365,"cost_usd":3.0635685,"latency_seconds":5.918777,"instance_count":19,"reward_curve":[0.775,0.0,0.0,0.925,0.0,0.0,0.0,0.85,0.9,0.0,0.725,0.0,0.875,0.625,0.9,0.0,0.875,0.0,0.0],"baseline_reward_curve":[0.825,0.0,0.8,0.0,0.575,0.525,0.625,0.75,0.35,0.75,0.0,0.7,0.875,0.775,0.0,0.575,0.0,0.0,0.0],"gain_curve":[-0.04999999999999993,0.0,-0.8,0.925,-0.575,-0.525,-0.625,0.09999999999999998,0.55,-0.75,0.725,-0.7,0.0,-0.15000000000000002,0.9,-0.575,0.875,0.0,0.0],"cost_curve":[0.097151,0.0852715,0.04941,0.04413,0.322272,0.082381,0.072733,0.083462,0.0675675,0.103554,0.2323745,0.092938,0.0699655,0.258258,0.0638465,0.0561405,0.084307,1.1120605,0.085746]},{"run_name":"mem0-gpt-5.4","task":"codebase_adaptation","run_index":3,"reward":10.549999999999999,"baseline_reward":8.125,"reference_reward":19.0,"gain":2.424999999999999,"normalized_reward":0.11518324607329838,"normalized_gain":0.22298850574712634,"cost_usd":2.5240195,"latency_seconds":5.550761,"instance_count":19,"reward_curve":[0.75,0.775,0.9,0.0,0.825,0.925,0.825,0.9,0.9,0.0,0.775,0.95,0.0,0.0,0.725,0.6,0.7,0.0,0.0],"baseline_reward_curve":[0.825,0.0,0.8,0.0,0.575,0.525,0.625,0.75,0.35,0.75,0.0,0.7,0.875,0.775,0.0,0.575,0.0,0.0,0.0],"gain_curve":[-0.07499999999999996,0.775,0.09999999999999998,0.0,0.25,0.4,0.19999999999999996,0.15000000000000002,0.55,-0.75,0.775,0.25,-0.875,-0.775,0.725,0.025000000000000022,0.7,0.0,0.0],"cost_curve":[0.1601165,0.1902055,0.0604335,0.1042345,0.1819565,0.0437045,0.1424385,0.066722,0.058104,0.078061,0.1149835,0.033371,0.17743,0.1643445,0.205924,0.354246,0.204595,0.0793785,0.1037705]},{"run_name":"mem0-gpt-5.4","task":"codebase_adaptation","run_index":4,"reward":13.399999999999999,"baseline_reward":8.125,"reference_reward":19.0,"gain":5.274999999999999,"normalized_reward":0.41361256544502606,"normalized_gain":0.4850574712643677,"cost_usd":2.1268815,"latency_seconds":4.752591,"instance_count":19,"reward_curve":[0.775,0.85,0.85,0.775,0.0,0.9,0.825,0.9,0.9,0.925,0.75,0.85,0.0,0.825,0.775,0.95,0.85,0.7,0.0],"baseline_reward_curve":[0.825,0.0,0.8,0.0,0.575,0.525,0.625,0.75,0.35,0.75,0.0,0.7,0.875,0.775,0.0,0.575,0.0,0.0,0.0],"gain_curve":[-0.04999999999999993,0.85,0.04999999999999993,0.775,-0.575,0.375,0.19999999999999996,0.15000000000000002,0.55,0.17500000000000004,0.75,0.15000000000000002,-0.875,0.04999999999999993,0.775,0.375,0.85,0.7,0.0],"cost_curve":[0.085823,0.132917,0.130518,0.2597995,0.0650265,0.0600195,0.107644,0.074266,0.060501,0.035846,0.13314,0.0828075,0.1421585,0.1232615,0.1683645,0.028324,0.1090705,0.2749455,0.052449]},{"run_name":"mem0-gpt-5.4","task":"cohort_studies","run_index":0,"reward":0.2364,"baseline_reward":0.8448,"reference_reward":3.24404,"gain":-0.6084,"normalized_reward":-0.33700223156734266,"normalized_gain":-0.2535803004284691,"cost_usd":6.905268,"latency_seconds":7.591437,"instance_count":20,"reward_curve":[0.0,0.0632,0.0502,0.0776,0.0359,0.0,0.0095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0469,0.0,0.0,0.1062,0.1532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1893,0.0,0.0,0.0,0.0717,0.2775],"gain_curve":[0.0,0.0632,0.0033000000000000043,0.0776,0.0359,-0.1062,-0.1437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1893,0.0,0.0,0.0,-0.0717,-0.2775],"cost_curve":[0.4573905,0.4307115,0.53752,0.3220715,0.4471905,0.4421855,0.59586,0.3828565,0.40011,0.412731,0.231949,0.4479165,0.3478905,0.3248365,0.1987205,0.225653,0.181783,0.1641885,0.166192,0.1875115]},{"run_name":"mem0-gpt-5.4","task":"cohort_studies","run_index":1,"reward":0.6536,"baseline_reward":0.8448,"reference_reward":3.24404,"gain":-0.19120000000000004,"normalized_reward":-0.1515420930501347,"normalized_gain":-0.07969190243577134,"cost_usd":4.6131225,"latency_seconds":9.296847,"instance_count":20,"reward_curve":[0.0161,0.0,0.1687,0.0,0.0,0.0,0.0923,0.0,0.0,0.0348,0.0176,0.0,0.0,0.0,0.0,0.0,0.0309,0.014,0.0,0.2792],"baseline_reward_curve":[0.0,0.0,0.0469,0.0,0.0,0.1062,0.1532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1893,0.0,0.0,0.0,0.0717,0.2775],"gain_curve":[0.0161,0.0,0.12179999999999999,0.0,0.0,-0.1062,-0.06090000000000001,0.0,0.0,0.0348,0.0176,0.0,0.0,0.0,-0.1893,0.0,0.0309,0.014,-0.0717,0.0016999999999999793],"cost_curve":[0.437335,0.410166,0.237683,0.2413545,0.413106,0.2053835,0.29439,0.1702355,0.2072435,0.2456685,0.1537475,0.1955905,0.1596025,0.1171675,0.1893275,0.1736235,0.17665,0.1863005,0.19031,0.2082375]},{"run_name":"mem0-gpt-5.4","task":"cohort_studies","run_index":2,"reward":1.8221000000000003,"baseline_reward":0.8448,"reference_reward":3.24404,"gain":0.9773000000000003,"normalized_reward":0.36789743680930337,"normalized_gain":0.4073373234857706,"cost_usd":4.8993035,"latency_seconds":8.951984,"instance_count":20,"reward_curve":[0.1711,0.0,0.1145,0.1341,0.1081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1901,0.003,0.2685,0.2381,0.2511,0.0,0.0,0.3435],"baseline_reward_curve":[0.0,0.0,0.0469,0.0,0.0,0.1062,0.1532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1893,0.0,0.0,0.0,0.0717,0.2775],"gain_curve":[0.1711,0.0,0.06760000000000001,0.1341,0.1081,-0.1062,-0.1532,0.0,0.0,0.0,0.0,0.0,0.1901,0.003,0.07920000000000002,0.2381,0.2511,0.0,-0.0717,0.066],"cost_curve":[0.241294,0.2818845,0.2359805,0.1883925,0.2965165,0.2352055,0.3081485,0.223972,0.2281735,0.279094,0.267004,0.255454,0.254535,0.2376125,0.2230655,0.2309295,0.237277,0.1841115,0.253581,0.237072]},{"run_name":"mem0-gpt-5.4","task":"cohort_studies","run_index":3,"reward":0.0811,"baseline_reward":0.8448,"reference_reward":3.24404,"gain":-0.7637,"normalized_reward":-0.4060385678849898,"normalized_gain":-0.3183091312248879,"cost_usd":7.4413995,"latency_seconds":11.795782,"instance_count":20,"reward_curve":[0.0,0.0811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0469,0.0,0.0,0.1062,0.1532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1893,0.0,0.0,0.0,0.0717,0.2775],"gain_curve":[0.0,0.0811,-0.0469,0.0,0.0,-0.1062,-0.1532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1893,0.0,0.0,0.0,-0.0717,-0.2775],"cost_curve":[0.292203,0.345004,0.3887675,0.3092055,0.458389,0.303688,0.4036735,0.4311485,0.184307,0.3427915,0.4782375,0.421053,0.413473,0.49249,0.391038,0.294988,0.344436,0.3552675,0.410007,0.381232]},{"run_name":"mem0-gpt-5.4","task":"cohort_studies","run_index":4,"reward":1.0977999999999999,"baseline_reward":0.8448,"reference_reward":3.24404,"gain":0.2529999999999999,"normalized_reward":0.04592049930207951,"normalized_gain":0.10545005918540867,"cost_usd":6.134443,"latency_seconds":7.758722,"instance_count":20,"reward_curve":[0.0146,0.0356,0.1996,0.033,0.0466,0.0,0.0,0.1446,0.0,0.0,0.0,0.0,0.3349,0.2808,0.0,0.0,0.0,0.0,0.0081,0.0],"baseline_reward_curve":[0.0,0.0,0.0469,0.0,0.0,0.1062,0.1532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1893,0.0,0.0,0.0,0.0717,0.2775],"gain_curve":[0.0146,0.0356,0.1527,0.033,0.0466,-0.1062,-0.1532,0.1446,0.0,0.0,0.0,0.0,0.3349,0.2808,-0.1893,0.0,0.0,0.0,-0.0636,-0.2775],"cost_curve":[0.5008385,0.553656,0.486978,0.444152,0.3860765,0.275218,0.281458,0.1917675,0.2887205,0.240694,0.2342225,0.288532,0.2510645,0.2338875,0.1994015,0.2717435,0.2780225,0.236413,0.252686,0.238911]},{"run_name":"mem0-gpt-5.4","task":"database_exploration","run_index":0,"reward":16.8,"baseline_reward":4.333333333333333,"reference_reward":40.0,"gain":12.466666666666669,"normalized_reward":0.3268858800773695,"normalized_gain":0.3495327102803739,"cost_usd":2.090264,"latency_seconds":2.911076,"instance_count":40,"reward_curve":[0.0,0.0,0.0,0.8666666666666667,0.0,0.8666666666666667,0.8,0.0,0.7333333333333334,0.8,0.6,0.6666666666666667,0.0,0.4666666666666667,0.0,0.8,0.9333333333333333,0.0,0.6666666666666667,0.6,0.0,0.0,0.0,0.0,0.6666666666666667,0.0,0.0,0.9333333333333333,0.0,0.7333333333333334,0.0,0.9333333333333333,0.8666666666666667,0.0,0.7333333333333334,0.6666666666666667,0.8,0.9333333333333333,0.7333333333333334,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.5333333333333333,0.0,0.5333333333333333,0.0,0.4,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.7333333333333334,0.0,0.6,0.0],"gain_curve":[0.0,0.0,0.0,0.5333333333333333,0.0,0.8666666666666667,0.2666666666666667,0.0,0.20000000000000007,0.8,0.19999999999999996,0.6666666666666667,0.0,0.06666666666666665,0.0,0.8,0.9333333333333333,0.0,0.6666666666666667,0.6,0.0,0.0,0.0,-0.4666666666666667,0.6666666666666667,0.0,0.0,0.9333333333333333,0.0,0.7333333333333334,0.0,0.9333333333333333,0.8666666666666667,0.0,0.7333333333333334,0.33333333333333337,0.06666666666666665,0.9333333333333333,0.13333333333333341,0.0],"cost_curve":[0.0756605,0.0525425,0.0713075,0.029145,0.014865,0.028515,0.0470075,0.0467425,0.0640335,0.03761,0.097424,0.063258,0.0467275,0.1280815,0.03383,0.0413395,0.015,0.0137975,0.0533765,0.0982595,0.0157875,0.0168775,0.0147025,0.062197,0.0582795,0.086271,0.06149,0.0154325,0.10021,0.059884,0.0828605,0.0145875,0.030475,0.10608,0.0595385,0.065385,0.0298475,0.016575,0.0451195,0.090141]},{"run_name":"mem0-gpt-5.4","task":"database_exploration","run_index":1,"reward":17.26666666666667,"baseline_reward":4.333333333333333,"reference_reward":40.0,"gain":12.933333333333337,"normalized_reward":0.3404255319148937,"normalized_gain":0.36261682242990667,"cost_usd":2.281882,"latency_seconds":3.108703,"instance_count":40,"reward_curve":[0.4,0.8,0.7333333333333334,0.8,0.0,0.5333333333333333,0.0,0.7333333333333334,0.0,0.8,0.8666666666666667,0.0,0.6,0.6,0.4666666666666667,0.9333333333333333,0.8666666666666667,0.0,0.9333333333333333,0.0,0.0,0.5333333333333333,0.0,0.8,0.8666666666666667,0.0,0.0,0.0,0.8666666666666667,0.6666666666666667,0.0,0.9333333333333333,0.0,0.0,0.8,0.0,0.0,0.9333333333333333,0.0,0.8],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.5333333333333333,0.0,0.5333333333333333,0.0,0.4,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.7333333333333334,0.0,0.6,0.0],"gain_curve":[0.4,0.8,0.7333333333333334,0.4666666666666667,0.0,0.5333333333333333,-0.5333333333333333,0.7333333333333334,-0.5333333333333333,0.8,0.4666666666666667,0.0,0.6,0.19999999999999996,0.4666666666666667,0.9333333333333333,0.8666666666666667,0.0,0.9333333333333333,0.0,0.0,0.5333333333333333,0.0,0.33333333333333337,0.8666666666666667,0.0,0.0,0.0,0.8666666666666667,0.6666666666666667,0.0,0.9333333333333333,0.0,0.0,0.8,-0.33333333333333337,-0.7333333333333334,0.9333333333333333,-0.6,0.8],"cost_curve":[0.107573,0.03132,0.0604375,0.0346325,0.044305,0.1187105,0.0700775,0.0538755,0.0399265,0.040597,0.029535,0.028105,0.1149205,0.1175965,0.1186695,0.0152125,0.0300875,0.047524,0.01506,0.03196,0.0158025,0.0989695,0.119138,0.0387735,0.02975,0.0899145,0.040152,0.1268065,0.0345575,0.061133,0.014785,0.0169925,0.08362,0.045126,0.0409245,0.126561,0.0894315,0.0147525,0.0150325,0.029534]},{"run_name":"mem0-gpt-5.4","task":"database_exploration","run_index":2,"reward":18.400000000000002,"baseline_reward":4.333333333333333,"reference_reward":40.0,"gain":14.06666666666667,"normalized_reward":0.3733075435203096,"normalized_gain":0.3943925233644861,"cost_usd":1.6868875,"latency_seconds":2.966648,"instance_count":40,"reward_curve":[0.0,0.0,0.7333333333333334,0.9333333333333333,0.8,0.6666666666666667,0.9333333333333333,0.0,0.0,0.8666666666666667,0.9333333333333333,0.6,0.0,0.0,0.8,0.6666666666666667,0.0,0.8,0.6,0.6666666666666667,0.8666666666666667,0.7333333333333334,0.8666666666666667,0.7333333333333334,0.0,0.0,0.4666666666666667,0.7333333333333334,0.8,0.0,0.6,0.8,0.0,0.0,0.0,0.8666666666666667,0.0,0.0,0.0,0.9333333333333333],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.5333333333333333,0.0,0.5333333333333333,0.0,0.4,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.7333333333333334,0.0,0.6,0.0],"gain_curve":[0.0,0.0,0.7333333333333334,0.6,0.8,0.6666666666666667,0.4,0.0,-0.5333333333333333,0.8666666666666667,0.5333333333333333,0.6,0.0,-0.4,0.8,0.6666666666666667,0.0,0.8,0.6,0.6666666666666667,0.8666666666666667,0.7333333333333334,0.8666666666666667,0.2666666666666667,0.0,0.0,0.4666666666666667,0.7333333333333334,0.8,0.0,0.6,0.8,0.0,0.0,0.0,0.5333333333333333,-0.7333333333333334,0.0,-0.6,0.9333333333333333],"cost_curve":[0.020345,0.041134,0.0408945,0.01313,0.04715,0.0694825,0.0144625,0.040114,0.0098115,0.029115,0.0145625,0.0825645,0.0159225,0.0477325,0.0396615,0.061285,0.01403,0.036858,0.092674,0.0649075,0.0299025,0.046501,0.02768,0.051663,0.037175,0.0147075,0.1015935,0.064162,0.0400095,0.01515,0.094945,0.041712,0.0762265,0.0147875,0.0539365,0.01865,0.0148125,0.116029,0.016445,0.0149625]},{"run_name":"mem0-gpt-5.4","task":"database_exploration","run_index":3,"reward":12.866666666666669,"baseline_reward":4.333333333333333,"reference_reward":40.0,"gain":8.533333333333335,"normalized_reward":0.2127659574468086,"normalized_gain":0.2392523364485982,"cost_usd":1.8116305,"latency_seconds":3.078039,"instance_count":40,"reward_curve":[0.0,0.7333333333333334,0.0,0.0,0.0,0.7333333333333334,0.9333333333333333,0.9333333333333333,0.0,0.8,0.6,0.8,0.8,0.0,0.0,0.6,0.8666666666666667,0.8,0.8,0.0,0.0,0.0,0.0,0.0,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.8666666666666667,0.0,0.0,0.0,0.0,0.8666666666666667,0.0,0.0],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.5333333333333333,0.0,0.5333333333333333,0.0,0.4,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.7333333333333334,0.0,0.6,0.0],"gain_curve":[0.0,0.7333333333333334,0.0,-0.33333333333333337,0.0,0.7333333333333334,0.4,0.9333333333333333,-0.5333333333333333,0.8,0.19999999999999996,0.8,0.8,-0.4,0.0,0.6,0.8666666666666667,0.8,0.8,0.0,0.0,0.0,0.0,-0.4666666666666667,0.9333333333333333,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.8666666666666667,0.0,0.0,-0.33333333333333337,-0.7333333333333334,0.8666666666666667,-0.6,0.0],"cost_curve":[0.069061,0.0459425,0.030213,0.0284275,0.0402325,0.053047,0.013685,0.0139225,0.028075,0.0447375,0.1011235,0.0330915,0.0416275,0.041305,0.0351725,0.1223725,0.02724,0.036588,0.0451675,0.0478325,0.123783,0.0628745,0.0144025,0.01419,0.0139775,0.1029895,0.087607,0.0293475,0.0965535,0.089518,0.015,0.043669,0.035845,0.01447,0.0140875,0.0608565,0.026538,0.022274,0.0295735,0.01521]},{"run_name":"mem0-gpt-5.4","task":"database_exploration","run_index":4,"reward":20.866666666666674,"baseline_reward":4.333333333333333,"reference_reward":40.0,"gain":16.533333333333342,"normalized_reward":0.444874274661509,"normalized_gain":0.46355140186915916,"cost_usd":1.9742495,"latency_seconds":3.024262,"instance_count":40,"reward_curve":[0.6,0.0,0.6,0.0,0.9333333333333333,0.7333333333333334,0.0,0.6666666666666667,0.0,0.0,0.8,0.0,0.0,0.0,0.4666666666666667,0.8,0.9333333333333333,0.0,0.0,0.6,0.6666666666666667,0.0,0.7333333333333334,0.8666666666666667,0.8,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.8,0.0,0.9333333333333333,0.0,0.8666666666666667,0.5333333333333333,0.8,0.7333333333333334,0.8666666666666667,0.7333333333333334,0.9333333333333333,0.7333333333333334],"baseline_reward_curve":[0.0,0.0,0.0,0.33333333333333337,0.0,0.0,0.5333333333333333,0.0,0.5333333333333333,0.0,0.4,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4666666666666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33333333333333337,0.7333333333333334,0.0,0.6,0.0],"gain_curve":[0.6,0.0,0.6,-0.33333333333333337,0.9333333333333333,0.7333333333333334,-0.5333333333333333,0.6666666666666667,-0.5333333333333333,0.0,0.4,0.0,0.0,-0.4,0.4666666666666667,0.8,0.9333333333333333,0.0,0.0,0.6,0.6666666666666667,0.0,0.7333333333333334,0.4,0.8,0.8666666666666667,0.9333333333333333,0.9333333333333333,0.8,0.0,0.9333333333333333,0.0,0.8666666666666667,0.5333333333333333,0.8,0.4,0.1333333333333333,0.7333333333333334,0.33333333333333337,0.7333333333333334],"cost_curve":[0.037251,0.084758,0.110005,0.0144875,0.0139975,0.0490265,0.053645,0.0833745,0.0917875,0.010309,0.0492825,0.01291,0.0945105,0.0567355,0.1651375,0.0479,0.015115,0.038014,0.069164,0.085528,0.061832,0.04968,0.0579665,0.028665,0.042776,0.0304925,0.0136875,0.0145925,0.0453975,0.01366,0.011858,0.0670325,0.0233975,0.099502,0.044005,0.049682,0.0242905,0.0447315,0.0145425,0.05352]},{"run_name":"mem0-gpt-5.4","task":"exploitable_poker","run_index":0,"reward":66.19999999999999,"baseline_reward":90.4,"reference_reward":1138.5,"gain":-24.200000000000017,"normalized_reward":-0.0670713503831227,"normalized_gain":-0.023089399866424977,"cost_usd":3.76374,"latency_seconds":4.669878,"instance_count":120,"reward_curve":[-2.4,2.4,-1.0,16.0,-2.0,2.4,5.0,-5.0,8.0,-7.2,4.4,-10.4,5.4,5.0,-1.0,-11.2,4.4,-2.4,5.6,10.4,2.0,-3.0,2.0,1.0,1.0,2.0,0.5,-2.0,0.5,-1.0,2.0,0.5,1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,4.0,-1.0,-2.0,-2.0,1.0,2.0,-5.0,2.0,2.0,1.0,2.0,2.4,2.4,-3.0,-2.4,2.0,4.4,2.4,-2.0,-4.0,-2.4,1.0,0.5,0.5,-1.0,1.0,1.0,1.0,1.0,1.0,-2.0,1.0,4.0,0.5,-2.0,-1.0,-1.0,-2.0,0.5,-1.0,2.0,-2.0,0.5,4.0,1.0,2.0,-2.0,1.0,2.0,1.0,1.0,0.5,1.0,-4.0,-2.0,0.5,0.5,1.0,2.0,2.0,1.0,0.5,2.0,-1.0,1.0,-1.0,2.0,0.5,-11.0,2.0,3.0,2.0,1.0,2.0,3.0,-3.0,2.0,6.0,2.0,0.5,3.0],"baseline_reward_curve":[-1.0,2.0,-2.0,9.0,-2.0,2.0,4.0,-2.0,11.4,-4.0,8.0,-6.0,10.0,7.0,-4.0,-2.0,3.0,-3.0,3.0,4.0,2.0,-5.0,-0.5,-1.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-3.0,1.0,-2.0,-5.0,2.0,2.0,-0.5,2.0,3.0,1.0,-9.0,-12.0,2.0,4.0,2.0,-2.0,-2.0,-1.0,-0.5,0.5,0.5,-1.0,1.0,1.0,1.0,1.0,1.0,-2.0,2.0,8.0,0.5,-4.0,-10.0,1.0,-0.5,0.5,-2.0,4.0,-3.0,0.5,5.0,1.0,3.0,-2.0,1.0,16.0,1.0,-1.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,2.0,2.0,1.0,0.5,2.0,-3.0,1.0,2.0,2.0,0.5,-13.0,2.0,2.0,5.0,-1.0,2.0,-3.0,-4.0,2.0,47.0,5.0,0.5,-4.0],"gain_curve":[-1.4,0.3999999999999999,1.0,7.0,0.0,0.3999999999999999,1.0,-3.0,-3.4000000000000004,-3.2,-3.5999999999999996,-4.4,-4.6,-2.0,3.0,-9.2,1.4000000000000004,0.6000000000000001,2.5999999999999996,6.4,0.0,2.0,2.5,2.0,0.0,2.5,0.0,-1.5,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,1.5,0.0,-0.6000000000000001,1.4,6.0,9.6,0.0,0.40000000000000036,0.3999999999999999,0.0,-2.0,-1.4,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-4.0,0.0,2.0,9.0,-2.0,-1.5,0.0,1.0,-2.0,1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,-14.0,0.0,2.0,0.0,3.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-3.0,0.0,0.0,2.0,0.0,1.0,-3.0,2.0,0.0,6.0,1.0,0.0,-41.0,-3.0,0.0,7.0],"cost_curve":[0.0172175,0.0243515,0.042352,0.0516525,0.05307,0.048665,0.0539425,0.04945,0.0500775,0.0556225,0.0568525,0.047285,0.05708,0.0421045,0.048512,0.045717,0.059075,0.0578275,0.0413835,0.046614,0.013877,0.0691695,0.0354575,0.0191875,0.0497645,0.0350525,0.0,0.035695,0.0,0.01908,0.0276615,0.0,0.0067275,0.0459435,0.0,0.0,0.031835,0.045896,0.01493,0.0606495,0.03509,0.036375,0.0236635,0.0230735,0.0239035,0.042054,0.0327265,0.059605,0.0192625,0.0334565,0.045029,0.0494915,0.044998,0.040915,0.0478425,0.045223,0.033131,0.0319665,0.032686,0.03658,0.0323515,0.0,0.0,0.0564465,0.0331415,0.0075125,0.019955,0.016464,0.0200125,0.047372,0.032224,0.059364,0.0,0.033966,0.0426935,0.0330765,0.0327135,0.0,0.049042,0.0478895,0.033471,0.0,0.048537,0.0202875,0.015441,0.049676,0.00758,0.0390865,0.0154735,0.0159285,0.0,0.015656,0.043485,0.0358245,0.0,0.0,0.0195325,0.0326615,0.0325815,0.0150765,0.0,0.0319255,0.0204925,0.031769,0.02065,0.029238,0.0,0.070666,0.0436765,0.0337745,0.0209925,0.019795,0.028808,0.033645,0.0422215,0.014729,0.051883,0.04294,0.0,0.02516]},{"run_name":"mem0-gpt-5.4","task":"exploitable_poker","run_index":1,"reward":230.39999999999998,"baseline_reward":90.4,"reference_reward":1138.5,"gain":139.99999999999997,"normalized_reward":0.09632799283510796,"normalized_gain":0.13357504054956587,"cost_usd":3.878048,"latency_seconds":4.658531,"instance_count":120,"reward_curve":[11.2,2.4,1.0,-1.0,2.0,10.8,-1.0,-1.0,5.2,5.2,-2.0,11.2,5.2,-11.2,5.2,-11.2,-3.0,-1.0,-5.2,2.4,-2.0,-2.0,0.0,1.0,3.0,0.5,78.0,-2.0,2.0,0.5,2.0,2.0,0.5,2.0,-1.0,0.5,-1.0,-1.0,-1.0,0.5,-2.0,1.0,-5.0,1.0,2.0,78.0,-3.0,-5.0,2.0,1.0,-10.4,5.2,-1.0,-2.0,-2.0,2.4,1.0,1.0,1.0,-1.0,3.0,1.0,-1.0,4.0,-2.0,-2.0,0.5,0.5,-2.0,2.0,0.5,1.0,1.0,1.0,1.0,-2.0,1.0,1.0,-2.0,-1.0,0.5,0.5,1.0,0.5,-3.0,2.0,7.0,-3.0,-3.0,1.0,0.5,1.0,1.0,1.0,1.0,-2.0,0.5,2.0,5.0,-1.0,-79.0,1.0,2.0,0.5,2.0,2.0,-4.0,1.0,0.5,4.0,2.0,2.0,100.0,3.0,4.0,0.5,2.0,2.0,-3.0,-2.0],"baseline_reward_curve":[-1.0,2.0,-2.0,9.0,-2.0,2.0,4.0,-2.0,11.4,-4.0,8.0,-6.0,10.0,7.0,-4.0,-2.0,3.0,-3.0,3.0,4.0,2.0,-5.0,-0.5,-1.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-3.0,1.0,-2.0,-5.0,2.0,2.0,-0.5,2.0,3.0,1.0,-9.0,-12.0,2.0,4.0,2.0,-2.0,-2.0,-1.0,-0.5,0.5,0.5,-1.0,1.0,1.0,1.0,1.0,1.0,-2.0,2.0,8.0,0.5,-4.0,-10.0,1.0,-0.5,0.5,-2.0,4.0,-3.0,0.5,5.0,1.0,3.0,-2.0,1.0,16.0,1.0,-1.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,2.0,2.0,1.0,0.5,2.0,-3.0,1.0,2.0,2.0,0.5,-13.0,2.0,2.0,5.0,-1.0,2.0,-3.0,-4.0,2.0,47.0,5.0,0.5,-4.0],"gain_curve":[12.2,0.3999999999999999,3.0,-10.0,4.0,8.8,-5.0,1.0,-6.2,9.2,-10.0,17.2,-4.8,-18.2,9.2,-9.2,-6.0,2.0,-8.2,-1.6,-4.0,3.0,0.5,2.0,2.0,1.0,77.5,-1.5,1.5,1.5,0.0,1.5,1.5,5.0,-1.5,0.0,1.0,-1.0,-2.0,-9.5,-1.0,3.0,-2.0,0.0,4.0,83.0,-5.0,-7.0,2.5,-1.0,-13.4,4.2,8.0,10.0,-4.0,-1.6,-1.0,3.0,3.0,0.0,3.5,0.5,-1.5,5.0,-3.0,-3.0,-0.5,-0.5,-3.0,4.0,-1.5,-7.0,0.5,5.0,11.0,-3.0,1.5,0.5,0.0,-5.0,3.5,0.0,-4.0,-0.5,-6.0,4.0,6.0,-19.0,-4.0,2.0,0.0,3.0,4.0,3.0,0.5,-2.5,-0.5,0.0,3.0,-2.0,-79.5,-1.0,5.0,-0.5,0.0,0.0,-4.5,14.0,-1.5,2.0,-3.0,3.0,98.0,6.0,8.0,-1.5,-45.0,-3.0,-3.5,2.0],"cost_curve":[0.01766,0.030825,0.04355,0.043166,0.056315,0.0508775,0.0544775,0.047695,0.0470175,0.0563625,0.033531,0.047088,0.05519,0.0584375,0.0567,0.056905,0.0511855,0.05631,0.0467095,0.0518275,0.0306075,0.034315,0.058255,0.0307675,0.0435065,0.0,0.030918,0.037335,0.0375475,0.0,0.0376075,0.0368775,0.0,0.083744,0.0195075,0.0,0.0278635,0.015299,0.051396,0.0,0.032449,0.0283455,0.074955,0.054244,0.0243685,0.064304,0.0564165,0.0897,0.0322775,0.0071575,0.0519025,0.0298255,0.042843,0.0445725,0.0347695,0.0305295,0.043708,0.0337825,0.0292935,0.043138,0.0557915,0.014899,0.032699,0.048323,0.0328,0.0532555,0.0,0.0,0.0424165,0.015484,0.0,0.0158815,0.020375,0.0201575,0.0154345,0.056892,0.0069225,0.014812,0.0541355,0.0329495,0.0,0.0,0.0073075,0.0,0.0488395,0.016304,0.0324,0.0329195,0.04871,0.0338345,0.0,0.0071925,0.0070025,0.007655,0.0345665,0.0237355,0.0,0.0225135,0.0818365,0.0148645,0.0641235,0.01411,0.023269,0.0,0.023004,0.022669,0.0369045,0.022012,0.0,0.041998,0.0270515,0.03119,0.0349105,0.032055,0.0226065,0.0,0.01528,0.0193275,0.074679,0.033989]},{"run_name":"mem0-gpt-5.4","task":"exploitable_poker","run_index":2,"reward":49.0,"baseline_reward":90.4,"reference_reward":1138.5,"gain":-41.400000000000006,"normalized_reward":-0.084187481341427,"normalized_gain":-0.03950004770537163,"cost_usd":3.57735,"latency_seconds":5.084836,"instance_count":120,"reward_curve":[4.0,-2.0,-4.0,2.0,-2.0,3.0,-3.0,-5.0,-4.0,2.0,5.0,-1.0,-3.0,4.0,3.0,2.0,3.0,-2.0,3.0,3.0,0.0,-0.5,1.0,-1.0,-0.5,0.5,1.0,-1.0,0.5,0.5,-1.0,1.0,-1.0,-5.0,-1.0,0.5,-1.0,-1.0,-1.0,1.0,-1.0,10.0,-1.0,10.0,-1.0,0.5,0.0,-2.0,-1.0,-1.0,2.0,3.0,-5.0,-5.0,-2.0,2.0,6.0,-2.0,4.0,-2.0,0.5,8.0,0.5,-1.0,0.5,1.0,1.0,-2.0,-3.0,2.0,1.0,2.0,1.0,-1.0,-1.0,3.0,1.0,1.0,-1.0,1.0,1.0,1.0,0.5,2.0,9.0,0.5,0.5,-2.0,-2.0,1.0,1.0,-6.0,1.0,0.5,1.0,-1.0,1.0,-2.0,3.0,-1.0,0.5,3.0,-11.0,-1.0,-2.0,-1.0,5.0,0.5,-1.0,12.0,2.0,-1.0,-1.0,-1.0,-1.0,0.5,2.0,0.5,1.0,4.0],"baseline_reward_curve":[-1.0,2.0,-2.0,9.0,-2.0,2.0,4.0,-2.0,11.4,-4.0,8.0,-6.0,10.0,7.0,-4.0,-2.0,3.0,-3.0,3.0,4.0,2.0,-5.0,-0.5,-1.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-3.0,1.0,-2.0,-5.0,2.0,2.0,-0.5,2.0,3.0,1.0,-9.0,-12.0,2.0,4.0,2.0,-2.0,-2.0,-1.0,-0.5,0.5,0.5,-1.0,1.0,1.0,1.0,1.0,1.0,-2.0,2.0,8.0,0.5,-4.0,-10.0,1.0,-0.5,0.5,-2.0,4.0,-3.0,0.5,5.0,1.0,3.0,-2.0,1.0,16.0,1.0,-1.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,2.0,2.0,1.0,0.5,2.0,-3.0,1.0,2.0,2.0,0.5,-13.0,2.0,2.0,5.0,-1.0,2.0,-3.0,-4.0,2.0,47.0,5.0,0.5,-4.0],"gain_curve":[5.0,-4.0,-2.0,-7.0,0.0,1.0,-7.0,-3.0,-15.4,6.0,-3.0,5.0,-13.0,-3.0,7.0,4.0,0.0,1.0,0.0,-1.0,-2.0,4.5,1.5,0.0,-1.5,1.0,0.5,-0.5,0.0,1.5,-3.0,0.5,0.0,-2.0,-1.5,0.0,1.0,-1.0,-2.0,-9.0,0.0,12.0,2.0,9.0,1.0,5.5,-2.0,-4.0,-0.5,-3.0,-1.0,2.0,4.0,7.0,-4.0,-2.0,4.0,0.0,6.0,-1.0,1.0,7.5,0.0,0.0,-0.5,0.0,0.0,-3.0,-4.0,4.0,-1.0,-6.0,0.5,3.0,9.0,2.0,1.5,0.5,1.0,-3.0,4.0,0.5,-4.5,1.0,6.0,2.5,-0.5,-18.0,-3.0,2.0,0.5,-4.0,4.0,2.5,0.5,-1.5,0.0,-4.0,1.0,-2.0,0.0,1.0,-8.0,-2.0,-4.0,-3.0,4.5,13.5,-3.0,10.0,-3.0,0.0,-3.0,2.0,3.0,-1.5,-45.0,-4.5,0.5,8.0],"cost_curve":[0.0162445,0.0345885,0.04881,0.051365,0.0502275,0.05139,0.05276,0.0570675,0.0522925,0.0527625,0.0547675,0.05079,0.049385,0.0509875,0.054435,0.041826,0.048553,0.0426405,0.054635,0.0559175,0.0426505,0.0076275,0.00678,0.0187425,0.00717,0.0,0.0141175,0.0185125,0.0,0.0,0.0071975,0.051385,0.042598,0.0922035,0.0295475,0.0,0.0186825,0.0183175,0.0180975,0.018415,0.0188375,0.05392,0.014362,0.0596525,0.018395,0.0,0.0456435,0.0280115,0.0189125,0.0192425,0.0515805,0.0300045,0.043985,0.044349,0.0299995,0.054457,0.039039,0.0309435,0.0320435,0.044662,0.0,0.042498,0.0,0.0462735,0.0,0.0148695,0.0149695,0.031488,0.052751,0.0339845,0.007445,0.0499575,0.015879,0.038871,0.0440325,0.0203275,0.015489,0.0072725,0.04809,0.049399,0.007575,0.0161165,0.0,0.05341,0.0547975,0.0,0.0,0.0493715,0.0392535,0.015792,0.016804,0.050219,0.016459,0.0,0.0161265,0.02026,0.032749,0.0424765,0.0148315,0.00769,0.0,0.0477825,0.057075,0.015309,0.0235845,0.034869,0.0659865,0.0,0.015609,0.073271,0.023312,0.046525,0.0207725,0.015074,0.015269,0.0,0.0241045,0.0,0.0151965,0.0345515]},{"run_name":"mem0-gpt-5.4","task":"exploitable_poker","run_index":3,"reward":-15.5,"baseline_reward":90.4,"reference_reward":1138.5,"gain":-105.9,"normalized_reward":-0.14837297243506817,"normalized_gain":-0.10103997710142164,"cost_usd":3.371876,"latency_seconds":4.586837,"instance_count":120,"reward_curve":[3.0,12.0,2.0,4.0,-2.0,-7.0,5.0,-3.0,-4.0,4.0,-4.0,2.0,-1.0,1.0,-2.0,-2.0,6.0,-2.0,5.0,13.0,-0.5,-0.5,-1.0,1.0,0.0,-2.0,2.0,-1.0,-1.0,-2.0,1.0,0.5,0.5,1.0,0.5,-0.5,0.0,-3.0,2.0,10.0,10.0,2.0,-2.0,-0.5,0.5,-1.0,1.0,-5.0,0.5,1.0,-9.0,-4.0,2.0,4.0,-0.5,2.0,-6.0,-2.0,4.0,-2.0,1.0,3.0,0.5,16.0,-2.0,1.0,-2.0,2.0,-1.0,0.5,-2.0,4.0,0.5,-1.0,1.0,-4.0,-0.5,-0.5,0.5,-4.0,4.0,0.5,0.5,-2.0,-2.0,1.0,1.0,1.0,0.5,-2.0,1.0,-2.0,6.0,-2.0,4.0,-100.0,12.0,-2.0,0.5,0.5,2.0,2.0,-1.0,0.5,1.0,2.0,1.0,-1.0,5.0,2.0,0.5,5.0,-3.0,-1.0,-1.0,1.0,2.0,2.0,-3.0,3.0],"baseline_reward_curve":[-1.0,2.0,-2.0,9.0,-2.0,2.0,4.0,-2.0,11.4,-4.0,8.0,-6.0,10.0,7.0,-4.0,-2.0,3.0,-3.0,3.0,4.0,2.0,-5.0,-0.5,-1.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-3.0,1.0,-2.0,-5.0,2.0,2.0,-0.5,2.0,3.0,1.0,-9.0,-12.0,2.0,4.0,2.0,-2.0,-2.0,-1.0,-0.5,0.5,0.5,-1.0,1.0,1.0,1.0,1.0,1.0,-2.0,2.0,8.0,0.5,-4.0,-10.0,1.0,-0.5,0.5,-2.0,4.0,-3.0,0.5,5.0,1.0,3.0,-2.0,1.0,16.0,1.0,-1.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,2.0,2.0,1.0,0.5,2.0,-3.0,1.0,2.0,2.0,0.5,-13.0,2.0,2.0,5.0,-1.0,2.0,-3.0,-4.0,2.0,47.0,5.0,0.5,-4.0],"gain_curve":[4.0,10.0,4.0,-5.0,0.0,-9.0,1.0,-1.0,-15.4,8.0,-12.0,8.0,-11.0,-6.0,2.0,0.0,3.0,1.0,2.0,9.0,-2.5,4.5,-0.5,2.0,-1.0,-1.5,1.5,-0.5,-1.5,-1.0,-1.0,0.0,1.5,4.0,0.0,-1.0,2.0,-3.0,1.0,0.0,11.0,4.0,1.0,-1.5,2.5,4.0,-1.0,-7.0,1.0,-1.0,-12.0,-5.0,11.0,16.0,-2.5,-2.0,-8.0,0.0,6.0,-1.0,1.5,2.5,0.0,17.0,-3.0,0.0,-3.0,1.0,-2.0,2.5,-4.0,-4.0,0.0,3.0,11.0,-5.0,0.0,-1.0,2.5,-8.0,7.0,0.0,-4.5,-3.0,-5.0,3.0,0.0,-15.0,-0.5,-1.0,0.5,0.0,9.0,0.0,3.5,-100.5,11.0,-4.0,-1.5,-0.5,1.5,0.0,2.0,-0.5,-1.0,0.0,0.5,12.0,3.0,0.0,-4.5,6.0,-5.0,2.0,3.0,-1.0,-45.0,-3.0,-3.5,7.0],"cost_curve":[0.0202075,0.037213,0.0526525,0.0468615,0.048948,0.0465935,0.0557375,0.052097,0.0558725,0.0436355,0.0567475,0.0475455,0.049375,0.0468955,0.0501725,0.05563,0.0503875,0.052115,0.043428,0.04034,0.006695,0.0067725,0.029125,0.0176425,0.040629,0.02648,0.0316225,0.0182275,0.0182675,0.0266695,0.0067175,0.0,0.0,0.01905,0.0,0.007145,0.048977,0.052936,0.0228625,0.042868,0.0521105,0.0249805,0.028988,0.0063775,0.0,0.0176575,0.0171825,0.069725,0.0,0.0213555,0.045988,0.0449885,0.0397505,0.033131,0.00656,0.0276245,0.039989,0.0293635,0.029401,0.0287685,0.01302,0.01364,0.0,0.040461,0.0466355,0.0131025,0.029319,0.040288,0.0341915,0.0,0.0335015,0.032274,0.0,0.0372935,0.025063,0.042121,0.0065825,0.00658,0.0,0.0427705,0.036896,0.0,0.0,0.05145,0.041548,0.0136925,0.0130875,0.0064525,0.0,0.040426,0.025093,0.041328,0.0290195,0.033014,0.0250455,0.020845,0.0609405,0.036755,0.0,0.0,0.03875,0.038871,0.0135425,0.0,0.0177175,0.019628,0.0133775,0.01769,0.041698,0.020433,0.0,0.0562735,0.037509,0.028435,0.0176175,0.0228065,0.0197885,0.0206055,0.045197,0.030416]},{"run_name":"mem0-gpt-5.4","task":"exploitable_poker","run_index":4,"reward":37.0,"baseline_reward":90.4,"reference_reward":1138.5,"gain":-53.400000000000006,"normalized_reward":-0.09612896805652303,"normalized_gain":-0.05094933689533442,"cost_usd":3.2758135,"latency_seconds":4.350082,"instance_count":120,"reward_curve":[7.0,-10.0,7.0,9.0,-2.0,4.5,4.0,4.5,2.0,-3.0,-9.0,-2.0,4.5,9.0,-9.0,-4.5,-5.0,2.0,4.5,-2.0,1.0,0.5,2.0,1.0,-1.0,2.0,2.0,-2.0,-5.0,10.0,-5.0,-1.0,0.5,2.0,-2.0,-4.0,-3.0,0.0,-1.0,2.0,0.5,1.0,1.0,0.5,0.5,2.0,1.0,2.0,2.0,-1.0,-5.5,-7.5,-5.0,-1.0,2.0,1.0,-2.0,4.0,1.0,2.0,1.0,5.0,1.0,0.5,-4.0,-2.0,-2.0,1.0,3.0,0.5,-2.0,3.0,1.0,-4.0,3.0,2.0,-1.0,0.5,0.5,1.0,0.5,-3.0,1.0,-2.0,-2.0,0.5,2.0,-0.5,12.0,1.0,0.5,1.0,1.0,-2.0,1.0,1.0,-1.0,-1.0,-1.0,-4.0,0.5,-1.0,2.0,2.0,-3.0,-1.0,15.0,0.5,0.5,2.0,2.0,1.0,3.0,3.0,0.5,2.0,-1.0,-11.0,-4.0,2.0],"baseline_reward_curve":[-1.0,2.0,-2.0,9.0,-2.0,2.0,4.0,-2.0,11.4,-4.0,8.0,-6.0,10.0,7.0,-4.0,-2.0,3.0,-3.0,3.0,4.0,2.0,-5.0,-0.5,-1.0,1.0,-0.5,0.5,-0.5,0.5,-1.0,2.0,0.5,-1.0,-3.0,0.5,0.5,-2.0,0.0,1.0,10.0,-1.0,-2.0,-3.0,1.0,-2.0,-5.0,2.0,2.0,-0.5,2.0,3.0,1.0,-9.0,-12.0,2.0,4.0,2.0,-2.0,-2.0,-1.0,-0.5,0.5,0.5,-1.0,1.0,1.0,1.0,1.0,1.0,-2.0,2.0,8.0,0.5,-4.0,-10.0,1.0,-0.5,0.5,-2.0,4.0,-3.0,0.5,5.0,1.0,3.0,-2.0,1.0,16.0,1.0,-1.0,0.5,-2.0,-3.0,-2.0,0.5,0.5,1.0,2.0,2.0,1.0,0.5,2.0,-3.0,1.0,2.0,2.0,0.5,-13.0,2.0,2.0,5.0,-1.0,2.0,-3.0,-4.0,2.0,47.0,5.0,0.5,-4.0],"gain_curve":[8.0,-12.0,9.0,0.0,0.0,2.5,0.0,6.5,-9.4,1.0,-17.0,4.0,-5.5,2.0,-5.0,-2.5,-8.0,5.0,1.5,-6.0,-1.0,5.5,2.5,2.0,-2.0,2.5,1.5,-1.5,-5.5,11.0,-7.0,-1.5,1.5,5.0,-2.5,-4.5,-1.0,0.0,-2.0,-8.0,1.5,3.0,4.0,-0.5,2.5,7.0,-1.0,0.0,2.5,-3.0,-8.5,-8.5,4.0,11.0,0.0,-3.0,-4.0,6.0,3.0,3.0,1.5,4.5,0.5,1.5,-5.0,-3.0,-3.0,0.0,2.0,2.5,-4.0,-5.0,0.5,0.0,13.0,1.0,-0.5,0.0,2.5,-3.0,3.5,-3.5,-4.0,-3.0,-5.0,2.5,1.0,-16.5,11.0,2.0,0.0,3.0,4.0,0.0,0.5,0.5,-2.0,-3.0,-3.0,-5.0,0.0,-3.0,5.0,1.0,-5.0,-3.0,14.5,13.5,-1.5,0.0,-3.0,2.0,1.0,6.0,4.5,0.0,-48.0,-16.0,-4.5,6.0],"cost_curve":[0.014837,0.029657,0.0460375,0.0340575,0.04943,0.0440135,0.039066,0.0427995,0.04971,0.0445965,0.044116,0.0444085,0.046781,0.0495625,0.0387725,0.052325,0.036085,0.046798,0.039814,0.0540925,0.0129905,0.0,0.020082,0.027816,0.0125035,0.0268135,0.026321,0.0219475,0.0906885,0.04865,0.0577665,0.016705,0.0,0.0313725,0.030385,0.05344,0.043942,0.047366,0.023202,0.028131,0.0,0.023809,0.0275685,0.0,0.0,0.0350035,0.0056125,0.026354,0.040995,0.0159125,0.0410195,0.0456365,0.030142,0.0420045,0.0308135,0.026161,0.0281205,0.0288065,0.0403175,0.031866,0.0222,0.0331475,0.0159675,0.0,0.0337015,0.0380115,0.026247,0.01555,0.04209,0.0,0.0299135,0.0154675,0.019339,0.0338615,0.031467,0.034132,0.0385765,0.0,0.0,0.015685,0.0,0.026832,0.0055825,0.026537,0.037291,0.0,0.0158725,0.0061525,0.0257905,0.026019,0.0,0.0154175,0.0163225,0.0276625,0.0162775,0.012691,0.01569,0.030485,0.012001,0.050963,0.0,0.012299,0.0267585,0.027948,0.052513,0.0058775,0.061997,0.0,0.0,0.049382,0.012739,0.0368415,0.0153625,0.0151075,0.0,0.0361605,0.0121615,0.0423945,0.049839,0.018264]},{"run_name":"mem0-gpt-5.4","task":"sales_prediction","run_index":0,"reward":7.7959000000000005,"baseline_reward":4.750499999999999,"reference_reward":12.0,"gain":3.0454000000000017,"normalized_reward":0.34338638386930526,"normalized_gain":0.4200841437340508,"cost_usd":2.694474,"latency_seconds":8.757407,"instance_count":12,"reward_curve":[0.4308,0.5912,0.6638,0.6592,0.6255,0.7057,0.6123,0.6793,0.6911,0.7169,0.7605,0.6596],"baseline_reward_curve":[0.5084,0.3314,0.4677,0.4468,0.4008,0.409,0.5115,0.1124,0.4237,0.5387,0.3138,0.2863],"gain_curve":[-0.07759999999999995,0.2598,0.19609999999999994,0.21240000000000003,0.22469999999999996,0.2967,0.1008,0.5669,0.2674,0.17820000000000003,0.44669999999999993,0.37329999999999997],"cost_curve":[0.1614955,0.185833,0.2447855,0.220731,0.124568,0.1830025,0.193904,0.357627,0.288847,0.3134045,0.191166,0.22911]},{"run_name":"mem0-gpt-5.4","task":"sales_prediction","run_index":1,"reward":8.3233,"baseline_reward":4.750499999999999,"reference_reward":12.0,"gain":3.572800000000001,"normalized_reward":0.42575788339294357,"normalized_gain":0.4928339885509346,"cost_usd":1.9930825,"latency_seconds":9.678059,"instance_count":12,"reward_curve":[0.1997,0.5456,0.6073,0.6834,0.6775,0.8041,0.7779,0.6792,0.8592,0.9087,0.8137,0.767],"baseline_reward_curve":[0.5084,0.3314,0.4677,0.4468,0.4008,0.409,0.5115,0.1124,0.4237,0.5387,0.3138,0.2863],"gain_curve":[-0.3087,0.2142,0.13959999999999995,0.23660000000000003,0.2767,0.39510000000000006,0.2664000000000001,0.5668,0.43549999999999994,0.37,0.49989999999999996,0.4807],"cost_curve":[0.130464,0.188065,0.188786,0.2941835,0.185759,0.1671955,0.0625425,0.1454395,0.121487,0.157014,0.1939085,0.158238]},{"run_name":"mem0-gpt-5.4","task":"sales_prediction","run_index":2,"reward":8.0766,"baseline_reward":4.750499999999999,"reference_reward":12.0,"gain":3.3261000000000003,"normalized_reward":0.38722726349821157,"normalized_gain":0.4588040554521001,"cost_usd":2.442802,"latency_seconds":10.07169,"instance_count":12,"reward_curve":[0.5679,0.5732,0.6114,0.7015,0.7093,0.5922,0.751,0.7535,0.7404,0.7775,0.6813,0.6174],"baseline_reward_curve":[0.5084,0.3314,0.4677,0.4468,0.4008,0.409,0.5115,0.1124,0.4237,0.5387,0.3138,0.2863],"gain_curve":[0.0595,0.24180000000000007,0.14370000000000005,0.25470000000000004,0.30850000000000005,0.18319999999999997,0.23950000000000005,0.6411,0.3166999999999999,0.2388,0.3675,0.33109999999999995],"cost_curve":[0.1385495,0.172545,0.148782,0.2054345,0.1882305,0.215678,0.253113,0.169177,0.2249135,0.3271175,0.2036205,0.195641]},{"run_name":"mem0-gpt-5.4","task":"sales_prediction","run_index":3,"reward":7.145900000000001,"baseline_reward":4.750499999999999,"reference_reward":12.0,"gain":2.395400000000002,"normalized_reward":0.2418667124806724,"normalized_gain":0.33042278777846773,"cost_usd":3.1072625,"latency_seconds":11.241982,"instance_count":12,"reward_curve":[0.4027,0.6099,0.8023,0.7384,0.6595,0.5979,0.5609,0.5307,0.5668,0.5822,0.5315,0.5631],"baseline_reward_curve":[0.5084,0.3314,0.4677,0.4468,0.4008,0.409,0.5115,0.1124,0.4237,0.5387,0.3138,0.2863],"gain_curve":[-0.10569999999999996,0.2785,0.3346,0.29159999999999997,0.2587,0.1889,0.0494,0.41829999999999995,0.14309999999999995,0.043500000000000094,0.21769999999999995,0.27680000000000005],"cost_curve":[0.177207,0.1633805,0.209073,0.217997,0.2736745,0.3251185,0.327073,0.3742375,0.2528515,0.252853,0.33641,0.197387]},{"run_name":"mem0-gpt-5.4","task":"sales_prediction","run_index":4,"reward":7.7323,"baseline_reward":4.750499999999999,"reference_reward":12.0,"gain":2.9818000000000016,"normalized_reward":0.3334530744842021,"normalized_gain":0.411311124905166,"cost_usd":3.7049155,"latency_seconds":10.726946,"instance_count":12,"reward_curve":[0.3,0.593,0.667,0.621,0.7351,0.7231,0.8323,0.7416,0.6757,0.6619,0.6632,0.5184],"baseline_reward_curve":[0.5084,0.3314,0.4677,0.4468,0.4008,0.409,0.5115,0.1124,0.4237,0.5387,0.3138,0.2863],"gain_curve":[-0.20839999999999997,0.2616,0.19930000000000003,0.17420000000000002,0.3343,0.3141,0.3208000000000001,0.6292,0.25199999999999995,0.12320000000000009,0.3494,0.23209999999999997],"cost_curve":[0.162294,0.183315,0.275472,0.2292155,0.317247,0.28403,0.3764955,0.4011975,0.41074,0.4158435,0.310591,0.3384745]}],"warnings":{},"normalization":{"reward":"(reward - stateless baseline of gpt-5.4) / (reference_max - stateless baseline of gpt-5.4)","gain":"gain / (reference_max - stateless baseline of that system)","reference_max":"Task-defined mean per-instance maximum reward from each task class's r_max."}}
