|
@@ -0,0 +1,61 @@
|
|
|
|
+Rank,Model,Non_Live Overall Acc,AST Summary,Exec Summary,Simple AST,Python Simple AST,Java Simple AST,JavaScript Simple AST,Multiple AST,Parallel AST,Parallel Multiple AST,Simple Exec,Python Simple Exec,REST Simple Exec,Multiple Exec,Parallel Exec,Parallel Multiple Exec,Irrelevance Detection
|
|
|
|
+1,Gemini-1.5-Pro-002 (Prompt),89.63%,88.96%,91.77%,79.83%,94.50%,65.00%,80.00%,94.00%,93.00%,89.00%,98.57%,100.00%,97.14%,96.00%,90.00%,82.50%,83.75%
|
|
|
|
+2,ToolACE-8B (FC),88.94%,87.06%,89.52%,76.25%,89.75%,65.00%,74.00%,93.00%,90.00%,89.00%,98.57%,100.00%,97.14%,94.00%,88.00%,77.50%,94.17%
|
|
|
|
+3,GPT-4-turbo-2024-04-09 (Prompt),88.80%,91.46%,90.00%,82.33%,97.00%,68.00%,82.00%,95.00%,95.00%,93.50%,99.50%,99.00%,100.00%,98.00%,80.00%,82.50%,73.33%
|
|
|
|
+4,GPT-4o-mini-2024-07-18 (Prompt),88.69%,86.23%,91.12%,79.42%,93.25%,65.00%,80.00%,93.00%,86.50%,86.00%,100.00%,100.00%,100.00%,96.00%,86.00%,82.50%,88.75%
|
|
|
|
+5,Hammer2.0-7b (FC),88.54%,90.27%,89.25%,80.58%,97.75%,66.00%,78.00%,95.00%,93.50%,92.00%,90.00%,100.00%,80.00%,94.00%,88.00%,85.00%,78.75%
|
|
|
|
+6,Gemini-1.5-Flash-002 (Prompt),87.60%,86.58%,89.48%,75.33%,95.00%,63.00%,68.00%,91.50%,91.50%,88.00%,95.93%,99.00%,92.86%,96.00%,86.00%,80.00%,84.17%
|
|
|
|
+7,xLAM-8x22b-r (FC),87.51%,88.15%,90.11%,81.08%,95.25%,66.00%,82.00%,93.00%,91.50%,87.00%,96.43%,100.00%,92.86%,96.00%,88.00%,80.00%,74.58%
|
|
|
|
+8,Llama-3.1-70B-Instruct (Prompt),87.50%,88.90%,89.34%,76.58%,95.75%,60.00%,74.00%,95.50%,93.50%,90.00%,91.36%,97.00%,85.71%,96.00%,90.00%,80.00%,74.58%
|
|
|
|
+9,Gemma-2-27b-it (Prompt),87.39%,88.52%,87.89%,81.08%,95.25%,64.00%,84.00%,92.50%,91.00%,89.50%,83.57%,100.00%,67.14%,96.00%,92.00%,80.00%,80.83%
|
|
|
|
+10,Gemini-1.5-Pro-001 (Prompt),86.17%,83.88%,87.52%,73.00%,91.00%,60.00%,68.00%,91.50%,88.00%,83.00%,91.57%,96.00%,87.14%,94.00%,82.00%,82.50%,90.00%
|
|
|
|
+11,GPT-4o-2024-08-06 (FC),86.15%,85.90%,85.64%,74.58%,91.75%,64.00%,68.00%,92.50%,92.00%,84.50%,87.07%,97.00%,77.14%,92.00%,86.00%,77.50%,89.17%
|
|
|
|
+12,Gemini-1.5-Flash-001 (Prompt),85.74%,86.17%,87.68%,73.17%,89.50%,64.00%,66.00%,90.50%,92.00%,89.00%,84.21%,97.00%,71.43%,94.00%,90.00%,82.50%,76.25%
|
|
|
|
+13,Qwen2.5-7B-Instruct (Prompt),85.58%,85.79%,88.13%,75.67%,96.00%,59.00%,72.00%,96.00%,88.50%,83.00%,94.50%,99.00%,90.00%,92.00%,86.00%,80.00%,74.58%
|
|
|
|
+14,Meta-Llama-3-70B-Instruct (Prompt),85.10%,87.17%,89.21%,75.17%,95.50%,60.00%,70.00%,95.50%,90.50%,87.50%,95.86%,96.00%,95.71%,96.00%,80.00%,85.00%,60.42%
|
|
|
|
+15,Gorilla-OpenFunctions-v2 (FC),84.81%,86.29%,86.09%,77.67%,95.00%,62.00%,76.00%,95.00%,89.00%,83.50%,95.86%,96.00%,95.71%,96.00%,80.00%,72.50%,73.75%
|
|
|
|
+16,Granite-20b-FunctionCalling (FC),84.64%,82.33%,85.91%,72.83%,90.50%,66.00%,62.00%,91.50%,84.50%,80.50%,85.64%,97.00%,74.29%,92.00%,86.00%,80.00%,88.75%
|
|
|
|
+17,GPT-4-turbo-2024-04-09 (FC),84.55%,84.67%,84.32%,69.17%,92.50%,59.00%,56.00%,91.00%,90.50%,88.00%,88.29%,98.00%,78.57%,88.00%,86.00%,75.00%,85.00%
|
|
|
|
+18,Gemma-2-9b-it (Prompt),84.52%,84.38%,85.18%,74.50%,93.50%,60.00%,70.00%,92.00%,88.00%,83.00%,84.21%,97.00%,71.43%,94.00%,90.00%,72.50%,82.50%
|
|
|
|
+19,Hammer2.0-1.5b (FC),84.44%,84.06%,88.95%,75.25%,94.75%,65.00%,66.00%,90.50%,88.00%,82.50%,93.29%,98.00%,88.57%,92.00%,88.00%,82.50%,67.92%
|
|
|
|
+20,o1-mini-2024-09-12 (Prompt),83.84%,81.31%,84.00%,73.75%,88.25%,61.00%,72.00%,90.00%,81.00%,80.50%,88.50%,97.00%,80.00%,92.00%,78.00%,77.50%,93.33%
|
|
|
|
+21,GPT-4o-mini-2024-07-18 (FC),83.72%,84.25%,84.12%,73.50%,90.50%,64.00%,66.00%,90.50%,90.00%,83.00%,83.50%,97.00%,70.00%,92.00%,86.00%,75.00%,80.00%
|
|
|
|
+22,Command-R-Plus (Prompt) (Original),82.19%,80.90%,85.07%,71.08%,89.25%,60.00%,64.00%,91.50%,82.00%,79.00%,93.29%,98.00%,88.57%,90.00%,82.00%,75.00%,75.83%
|
|
|
|
+23,mistral-large-2407 (FC),81.41%,86.62%,84.57%,73.00%,96.00%,57.00%,66.00%,92.00%,91.50%,90.00%,73.79%,99.00%,48.57%,94.00%,88.00%,82.50%,47.92%
|
|
|
|
+24,Llama-3.1-8B-Instruct (Prompt),81.15%,83.62%,87.29%,73.00%,94.00%,59.00%,66.00%,94.50%,83.50%,83.50%,85.64%,97.00%,74.29%,96.00%,90.00%,77.50%,46.67%
|
|
|
|
+25,xLAM-7b-r (FC),80.86%,81.40%,83.46%,73.08%,91.25%,56.00%,72.00%,93.50%,79.50%,79.50%,76.86%,98.00%,55.71%,92.00%,90.00%,75.00%,68.33%
|
|
|
|
+26,Hermes-2-Pro-Llama-3-70B (FC),78.81%,78.85%,80.45%,59.92%,83.75%,54.00%,42.00%,80.00%,88.00%,87.50%,76.29%,94.00%,58.57%,82.00%,86.00%,77.50%,72.08%
|
|
|
|
+27,GPT-3.5-Turbo-0125 (FC),78.52%,84.12%,84.11%,75.50%,94.50%,64.00%,68.00%,93.00%,88.00%,80.00%,95.43%,98.00%,92.86%,90.00%,86.00%,65.00%,33.75%
|
|
|
|
+28,Open-Mistral-Nemo-2407 (FC),78.29%,81.21%,77.04%,63.33%,92.00%,36.00%,62.00%,92.00%,86.50%,83.00%,55.64%,97.00%,14.29%,90.00%,90.00%,72.50%,71.67%
|
|
|
|
+29,Qwen2.5-1.5B-Instruct (Prompt),78.14%,75.19%,82.82%,70.25%,87.75%,55.00%,68.00%,85.50%,73.50%,71.50%,72.79%,97.00%,48.57%,94.00%,82.00%,82.50%,71.25%
|
|
|
|
+30,Qwen2-7B-Instruct (Prompt),75.50%,74.85%,81.70%,67.42%,84.25%,60.00%,58.00%,87.50%,71.00%,73.50%,86.79%,95.00%,78.57%,88.00%,82.00%,70.00%,53.33%
|
|
|
|
+31,Command-R-Plus (FC) (Original),75.47%,76.83%,78.61%,66.33%,87.00%,60.00%,52.00%,90.00%,82.00%,69.00%,88.93%,95.00%,82.86%,88.00%,80.00%,57.50%,57.50%
|
|
|
|
+32,Hermes-2-Pro-Llama-3-8B (FC),74.14%,76.54%,75.48%,64.17%,90.50%,56.00%,46.00%,89.50%,79.50%,73.00%,69.93%,97.00%,42.86%,94.00%,78.00%,60.00%,59.17%
|
|
|
|
+33,Llama-3.2-3B-Instruct (Prompt),74.03%,77.77%,69.41%,64.08%,81.25%,49.00%,62.00%,90.00%,80.50%,76.50%,78.14%,82.00%,74.29%,92.00%,50.00%,57.50%,77.50%
|
|
|
|
+34,xLAM-8x7b-r (FC),73.93%,68.85%,78.43%,68.42%,79.25%,60.00%,66.00%,88.00%,63.50%,55.50%,87.71%,94.00%,81.43%,88.00%,68.00%,70.00%,76.25%
|
|
|
|
+35,Hermes-2-Pro-Mistral-7B (FC),69.78%,72.83%,77.30%,61.33%,86.00%,56.00%,42.00%,87.50%,78.50%,64.00%,61.71%,92.00%,31.43%,94.00%,86.00%,67.50%,27.50%
|
|
|
|
+36,DBRX-Instruct (Prompt),68.89%,67.04%,75.04%,72.17%,92.50%,54.00%,70.00%,91.50%,56.50%,48.00%,90.14%,96.00%,84.29%,88.00%,62.00%,60.00%,51.67%
|
|
|
|
+37,Hammer2.0-0.5b (FC),68.44%,66.79%,70.43%,62.17%,82.50%,52.00%,52.00%,80.00%,67.50%,57.50%,53.21%,95.00%,11.43%,86.00%,80.00%,62.50%,67.08%
|
|
|
|
+38,xLAM-7b-fc-r (FC),67.87%,74.56%,65.75%,74.25%,93.75%,63.00%,66.00%,92.00%,78.00%,54.00%,84.50%,99.00%,70.00%,90.00%,66.00%,22.50%,49.58%
|
|
|
|
+39,GPT-3.5-Turbo-0125 (Prompt),67.78%,65.04%,67.68%,62.67%,78.00%,48.00%,62.00%,83.00%,65.50%,49.00%,46.21%,91.00%,1.43%,90.00%,72.00%,62.50%,79.17%
|
|
|
|
+40,GPT-4o-2024-08-06 (Prompt),63.57%,49.35%,69.93%,32.42%,66.25%,11.00%,20.00%,48.00%,74.00%,43.00%,49.71%,88.00%,11.43%,82.00%,78.00%,70.00%,95.00%
|
|
|
|
+41,Claude-3.5-Sonnet-20240620 (Prompt),61.29%,60.58%,54.20%,50.33%,94.00%,31.00%,26.00%,88.00%,43.50%,60.50%,66.79%,75.00%,58.57%,52.00%,38.00%,60.00%,92.50%
|
|
|
|
+42,Meta-Llama-3-8B-Instruct (Prompt),58.99%,61.15%,66.70%,63.08%,88.25%,49.00%,52.00%,85.50%,52.00%,44.00%,83.29%,88.00%,78.57%,82.00%,44.00%,57.50%,19.58%
|
|
|
|
+43,Gemini-1.0-Pro-002 (Prompt),58.91%,56.29%,62.39%,42.17%,43.50%,39.00%,44.00%,51.00%,68.50%,63.50%,48.57%,70.00%,27.14%,76.00%,70.00%,55.00%,55.42%
|
|
|
|
+44,MiniCPM3-4B (FC),57.87%,63.19%,48.70%,67.75%,83.25%,54.00%,66.00%,74.00%,60.50%,50.50%,44.79%,51.00%,38.57%,50.00%,40.00%,60.00%,73.33%
|
|
|
|
+45,Gemini-1.5-Pro-002 (FC),56.71%,38.27%,69.54%,54.08%,41.25%,55.00%,66.00%,39.50%,29.50%,30.00%,69.64%,85.00%,54.29%,80.00%,76.00%,52.50%,79.17%
|
|
|
|
+46,Nexusflow-Raven-v2 (FC),55.21%,46.15%,57.86%,57.58%,37.75%,63.00%,72.00%,53.00%,34.50%,39.50%,47.43%,82.00%,12.86%,86.00%,38.00%,60.00%,80.83%
|
|
|
|
+47,Gemini-1.5-Pro-001 (FC),54.90%,31.77%,70.39%,35.58%,40.75%,24.00%,42.00%,39.50%,26.50%,25.50%,75.07%,83.00%,67.14%,80.00%,74.00%,52.50%,85.42%
|
|
|
|
+48,Qwen2-1.5B-Instruct (Prompt),53.99%,59.73%,58.52%,55.92%,79.75%,42.00%,46.00%,80.00%,55.50%,47.50%,51.07%,85.00%,17.14%,82.00%,56.00%,45.00%,12.92%
|
|
|
|
+49,Claude-3-Haiku-20240307 (Prompt),53.93%,58.21%,57.93%,76.83%,95.50%,63.00%,72.00%,93.50%,38.00%,24.50%,89.71%,98.00%,81.43%,96.00%,26.00%,20.00%,20.83%
|
|
|
|
+50,Gemini-1.5-Flash-002 (FC),53.15%,35.42%,60.84%,49.67%,39.00%,56.00%,54.00%,39.00%,24.00%,29.00%,60.86%,66.00%,55.71%,80.00%,50.00%,52.50%,93.33%
|
|
|
|
+51,Gemini-1.5-Flash-001 (FC),51.40%,33.56%,62.41%,47.25%,41.75%,54.00%,46.00%,40.00%,22.50%,24.50%,53.14%,82.00%,24.29%,76.00%,68.00%,52.50%,78.75%
|
|
|
|
+52,Gemini-1.0-Pro-002 (FC),45.85%,26.21%,58.11%,48.83%,42.50%,56.00%,48.00%,39.00%,7.50%,9.50%,76.43%,80.00%,72.86%,76.00%,60.00%,20.00%,75.42%
|
|
|
|
+53,Mistral-Medium-2312 (Prompt),45.66%,10.00%,70.12%,40.00%,0.00%,56.00%,64.00%,0.00%,0.00%,0.00%,49.00%,98.00%,0.00%,92.00%,72.00%,67.50%,90.42%
|
|
|
|
+54,Llama-3.1-8B-Instruct (FC),44.10%,47.98%,50.73%,55.42%,50.25%,56.00%,60.00%,54.00%,48.00%,34.50%,58.93%,65.00%,52.86%,58.00%,56.00%,30.00%,2.08%
|
|
|
|
+55,xLAM-1b-fc-r (FC),35.96%,39.94%,40.23%,71.25%,82.75%,59.00%,72.00%,85.50%,1.50%,1.50%,74.93%,97.00%,52.86%,86.00%,0.00%,0.00%,2.92%
|
|
|
|
+56,Llama-3.2-1B-Instruct (Prompt),23.94%,22.77%,19.11%,25.08%,53.25%,12.00%,10.00%,32.00%,24.00%,10.00%,27.93%,53.00%,2.86%,18.00%,28.00%,2.50%,47.92%
|
|
|
|
+57,Gemma-2-2b-it (Prompt),19.01%,12.19%,12.88%,7.25%,15.75%,2.00%,4.00%,41.50%,0.00%,0.00%,5.50%,11.00%,0.00%,46.00%,0.00%,0.00%,70.83%
|
|
|
|
+58,Llama-3.1-70B-Instruct (FC),15.22%,3.75%,5.50%,0.00%,0.00%,0.00%,0.00%,0.00%,6.50%,8.50%,0.00%,0.00%,0.00%,0.00%,22.00%,0.00%,100.00%
|
|
|
|
+59,Llama-3.2-3B-Instruct (FC),11.11%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%
|
|
|
|
+60,Llama-3.2-1B-Instruct (FC),10.93%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,98.33%
|