diff --git a/data.generated.js b/data.generated.js index 095877d16..55f1aa833 100644 --- a/data.generated.js +++ b/data.generated.js @@ -192,6 +192,7 @@ const data = [ ,{"system":"Databricks","date":"2025-11-13","machine":"Databricks: Small","cluster_size":4,"proprietary":"yes","tuned":"no","tags":["managed","column-oriented"],"load_time":67.284,"data_size":10219802927,"result":[[0.572,0.302,0.29],[1.314,0.379,0.375],[0.706,0.519,0.395],[0.782,0.421,0.398],[0.764,0.724,0.747],[1.166,0.925,0.935],[0.819,0.464,0.43],[0.36,0.331,0.355],[1.333,1.048,1.041],[1.345,1.356,1.228],[0.74,0.506,0.49],[0.678,0.501,0.499],[0.906,0.85,0.83],[1.089,1.048,1.104],[1.177,0.917,1.026],[0.944,0.861,0.743],[1.648,1.871,1.624],[1.293,1.176,1.197],[3.115,2.425,3.603],[0.318,0.247,0.241],[1.389,0.789,0.826],[0.958,0.88,0.854],[1.686,1.259,1.474],[4.003,2.139,2.323],[0.659,0.681,0.707],[1.053,0.458,0.563],[0.548,0.896,0.493],[0.904,0.888,0.847],[6.125,5.512,5.46],[0.684,0.798,0.65],[0.818,0.72,0.682],[0.916,0.965,0.731],[3.057,2.656,2.175],[3.599,3.405,3.362],[3.623,3.872,3.713],[0.872,0.891,0.892],[0.59,0.639,0.557],[0.351,0.358,0.36],[0.538,0.376,0.42],[1.138,0.856,0.845],[0.444,0.376,0.317],[0.377,0.338,0.32],[0.377,0.359,0.335]],"source":"databricks/results/small.json"} ,{"system":"Databricks","date":"2025-11-13","machine":"Databricks: 
X-Large","cluster_size":32,"proprietary":"yes","tuned":"no","tags":["managed","column-oriented"],"load_time":40.675,"data_size":10219802927,"result":[[0.571,0.315,0.27],[2.524,0.445,0.371],[0.524,0.361,0.339],[0.565,0.323,0.337],[0.62,0.587,0.64],[0.798,0.608,0.762],[0.628,0.437,0.423],[0.362,0.326,0.332],[4.545,0.993,0.954],[1.319,1.239,1.038],[0.632,0.444,0.447],[0.611,0.485,0.507],[0.732,0.621,0.593],[0.675,0.657,0.637],[0.691,0.661,0.616],[0.584,0.531,0.566],[0.734,0.71,0.716],[0.546,0.544,0.499],[1.168,0.927,0.928],[0.271,0.208,0.223],[0.873,0.493,0.493],[0.592,0.51,0.503],[1.003,0.71,0.709],[1.366,0.939,1.011],[0.341,0.322,0.381],[0.304,0.292,0.301],[0.333,0.331,0.339],[0.6,0.52,0.583],[3.191,3.114,3.197],[0.553,0.498,0.493],[0.612,0.547,0.565],[0.67,0.527,0.492],[1.087,1.368,0.981],[1.575,1.544,1.519],[1.502,1.66,1.675],[0.567,0.542,0.552],[0.565,0.57,0.55],[0.365,0.366,0.377],[0.575,0.382,0.376],[1.021,0.817,0.822],[0.429,0.339,0.339],[0.354,0.335,0.346],[0.382,0.326,0.322]],"source":"databricks/results/x-large.json"} ,{"system":"Databricks","date":"2025-11-13","machine":"Databricks: 
X-Small","cluster_size":2,"proprietary":"yes","tuned":"no","tags":["managed","column-oriented"],"load_time":91.254,"data_size":10219802927,"result":[[0.681,0.306,0.269],[1.44,0.393,0.351],[0.653,0.492,0.438],[0.756,0.418,0.381],[1.037,0.889,0.899],[1.605,1.297,1.259],[0.718,0.459,0.432],[0.396,0.371,0.387],[1.569,1.378,1.245],[1.756,1.946,1.903],[0.871,0.591,0.646],[0.874,0.62,0.617],[1.347,1.466,1.354],[1.808,1.643,1.754],[1.656,1.571,1.542],[1.296,1.291,1.171],[3.088,2.775,2.415],[2.029,2.052,2.041],[5.499,4.354,4.611],[0.315,0.271,0.254],[2.104,1.22,1.273],[1.387,1.392,1.386],[2.775,2.336,2.22],[6.857,3.872,3.513],[0.715,0.866,0.734],[0.535,0.676,0.523],[0.681,0.755,0.723],[1.45,1.337,1.267],[10.987,11.04,10.956],[0.873,0.811,0.784],[1.092,1.178,0.972],[1.425,1.512,1.398],[5.212,4.141,3.656],[5.814,5.669,6.126],[5.893,5.416,5.67],[1.278,1.239,1.263],[0.604,0.539,0.552],[0.353,0.371,0.364],[0.548,0.387,0.358],[1.115,0.894,0.87],[0.417,0.362,0.566],[0.382,0.347,0.35],[0.372,0.343,0.329]],"source":"databricks/results/x-small.json"} +,{"system":"DataFusion 47.0.0 (Parquet, 
partitioned)","date":"2025-11-24","machine":"repro-c6a.4xlarge","cluster_size":1,"proprietary":"no","tuned":"no","tags":["Rust","column-oriented","embedded","stateless"],"load_time":0,"data_size":14737666736,"result":[[0.108,0.032,0.031,0.063,0.017,0.018,0.061,0.018,0.017],[0.161,0.054,0.053,0.113,0.036,0.036,0.125,0.035,0.036],[0.307,0.095,0.098,0.206,0.090,0.085,0.212,0.085,0.087],[0.577,0.112,0.108,0.435,0.086,0.087,0.433,0.088,0.087],[1.160,0.769,0.757,1.030,0.847,0.858,1.084,0.830,0.832],[1.110,0.829,0.826,1.052,0.785,0.787,0.981,0.781,0.777],[0.112,0.032,0.032,0.083,0.027,0.024,0.084,0.026,0.024],[0.169,0.056,0.057,0.130,0.040,0.039,0.127,0.038,0.039],[1.099,0.931,0.914,1.034,0.924,0.910,1.088,0.905,0.886],[1.771,1.007,1.006,1.359,1.000,1.001,1.363,1.000,0.999],[0.667,0.232,0.236,0.561,0.236,0.229,0.563,0.237,0.242],[0.882,0.257,0.253,0.706,0.274,0.274,0.784,0.268,0.275],[1.204,0.839,0.833,1.079,0.862,0.872,1.087,0.848,0.863],[2.712,1.391,1.414,2.670,1.370,1.414,2.754,1.356,1.397],[1.228,0.804,0.813,1.095,0.836,0.831,1.122,0.822,0.827],[1.023,0.870,0.882,1.082,0.941,0.962,1.077,0.954,0.983],[2.751,1.688,1.681,2.671,1.691,1.729,2.632,1.691,1.708],[2.749,1.683,1.683,2.586,1.634,1.632,2.573,1.624,1.642],[5.618,3.391,3.380,5.243,3.378,3.368,5.235,3.366,3.360],[0.375,0.103,0.104,0.318,0.078,0.079,0.444,0.082,0.080],[10.142,1.119,1.114,9.956,1.042,1.061,9.830,1.057,1.077],[11.557,1.381,1.376,11.391,1.272,1.229,11.387,1.229,1.306],[22.326,2.639,2.549,22.071,2.686,2.643,22.107,2.618,2.640],[52.872,9.353,9.169,55.525,10.308,10.289,55.512,10.243,10.216],[0.390,0.155,0.165,2.819,0.447,0.447,2.844,0.450,0.428],[1.144,0.261,0.256,0.848,0.328,0.341,0.860,0.346,0.325],[0.380,0.160,0.157,2.829,0.510,0.516,2.849,0.507,0.508],[10.451,1.511,1.507,9.680,1.535,1.541,9.727,1.516,1.529],[9.596,8.827,9.053,9.868,9.241,9.625,9.862,9.483,9.565],[0.582,0.430,0.453,0.527,0.435,0.419,0.510,0.432,0.416],[3.205,0.791,0.802,2.463,0.754,0.765,2.479,0.759,0.753],[6.970,0.976,0.983,6.142,0.955,
0.932,6.156,0.911,0.926],[5.111,3.477,3.508,4.616,3.391,3.372,4.613,3.375,3.432],[10.275,3.680,3.682,10.157,3.668,3.694,10.196,3.648,3.677],[10.314,3.657,3.658,10.124,3.677,3.687,10.152,3.657,3.663],[1.385,1.231,1.252,1.336,1.227,1.214,1.318,1.197,1.196],[0.357,0.141,0.134,0.291,0.112,0.115,0.293,0.105,0.099],[0.217,0.075,0.076,0.178,0.050,0.049,0.174,0.048,0.048],[0.341,0.140,0.142,0.293,0.100,0.114,0.268,0.100,0.129],[0.506,0.208,0.225,0.432,0.180,0.167,0.437,0.173,0.167],[0.199,0.071,0.075,0.175,0.044,0.052,0.177,0.041,0.045],[0.191,0.068,0.064,0.170,0.042,0.041,0.164,0.043,0.055],[0.178,0.058,0.062,0.144,0.036,0.037,0.148,0.036,0.035]],"source":"datafusion-partitioned/results/47-c6a.4xlarge.json"} ,{"system":"DataFusion (Parquet, partitioned)","date":"2025-07-10","machine":"c6a.2xlarge","cluster_size":1,"proprietary":"no","tuned":"no","tags":["Rust","column-oriented","embedded","stateless","lukewarm-cold-run"],"load_time":0,"data_size":14737666736,"result":[[0.068,0.022,0.021],[0.167,0.06,0.059],[0.362,0.144,0.147],[0.523,0.109,0.113],[1.644,1.224,1.334],[1.719,1.167,1.174],[0.13,0.037,0.038],[0.181,0.07,0.065],[1.803,1.414,1.398],[2.079,1.591,1.617],[0.875,0.396,0.381],[1.016,0.452,0.44],[1.702,1.216,1.197],[3.255,1.883,1.93],[1.629,1.124,1.237],[1.816,1.529,1.51],[3.179,2.585,2.593],[2.891,2.197,2.287],[6.073,4.78,4.877],[0.597,0.1,0.101],[9.674,1.35,1.344],[11.432,1.673,1.652],[22.163,3.015,3.05],[55.44,46.286,43.371],[2.831,0.611,0.604],[1.025,0.535,0.558],[2.845,0.724,0.724],[9.733,2.09,2.088],[19.263,18.559,18.21],[0.953,0.806,0.774],[2.548,1.265,1.166],[6.191,1.162,1.161],[5.003,4.177,4.193],[10.349,4.795,4.817],[10.307,4.831,4.884],[2.14,1.835,1.843],[0.352,0.121,0.111],[0.217,0.056,0.058],[0.328,0.11,0.109],[0.47,0.156,0.157],[0.201,0.05,0.046],[0.186,0.046,0.046],[0.174,0.041,0.044]],"source":"datafusion-partitioned/results/c6a.2xlarge.json"} ,{"system":"DataFusion (Parquet, 
partitioned)","date":"2025-07-10","machine":"c6a.4xlarge","cluster_size":1,"proprietary":"no","tuned":"no","tags":["Rust","column-oriented","embedded","stateless","lukewarm-cold-run"],"load_time":0,"data_size":14737666736,"result":[[0.058,0.017,0.015],[0.116,0.035,0.037],[0.2,0.084,0.088],[0.43,0.081,0.084],[1.086,0.78,0.799],[0.977,0.751,0.756],[0.086,0.026,0.026],[0.125,0.04,0.037],[1.011,0.882,0.862],[1.349,0.971,0.983],[0.565,0.231,0.24],[0.677,0.264,0.265],[1.062,0.816,0.82],[2.769,1.346,1.201],[1.135,0.792,0.78],[1.021,0.926,0.916],[2.638,1.639,1.63],[2.585,1.555,1.592],[5.159,3.238,3.24],[0.26,0.077,0.077],[10.045,1.067,1.082],[11.424,1.291,1.269],[22.117,2.487,2.511],[55.492,9.765,9.851],[2.825,0.432,0.423],[0.853,0.328,0.33],[2.837,0.508,0.504],[9.744,1.469,1.478],[9.444,9.445,9.475],[0.515,0.405,0.415],[2.433,0.729,0.735],[6.158,0.884,0.891],[4.608,3.342,3.281],[10.221,3.481,3.455],[10.145,3.486,3.46],[1.261,1.188,1.168],[0.309,0.114,0.114],[0.175,0.05,0.048],[0.313,0.099,0.117],[0.451,0.166,0.192],[0.183,0.04,0.043],[0.171,0.04,0.041],[0.143,0.035,0.037]],"source":"datafusion-partitioned/results/c6a.4xlarge.json"} ,{"system":"DataFusion (Parquet, 
partitioned)","date":"2025-07-11","machine":"c6a.xlarge","cluster_size":1,"proprietary":"no","tuned":"no","tags":["Rust","column-oriented","embedded","stateless","lukewarm-cold-run"],"load_time":0,"data_size":14737666736,"result":[[0.075,0.035,0.034],[0.209,0.105,0.107],[0.558,0.278,0.281],[0.681,0.201,0.209],[3.153,2.413,2.399],[2.628,2.299,2.034],[0.155,0.064,0.065],[0.244,0.143,0.137],[3.546,2.918,2.963],[4.135,3.296,3.367],[1.376,0.779,0.817],[1.548,1.001,0.951],[2.942,2.662,2.272],[4.581,3.397,3.699],[2.802,2.287,2.28],[3.964,3.285,3.753],[5.96,5.313,5.198],[4.913,4.098,4.001],[null,null,null],[0.697,0.169,0.17],[9.898,2.361,2.249],[11.36,3.659,3.492],[22.105,17.643,16.388],[56.066,49.612,48.044],[2.824,1.274,1.265],[1.471,1.07,1.149],[2.855,1.477,1.477],[9.621,4.491,4.587],[42.151,40.396,40.48],[1.704,1.498,1.511],[3.412,2.41,2.46],[6.256,2.544,2.367],[null,null,null],[null,null,22.127],[21.955,null,null],[4.232,4.072,3.842],[0.329,0.121,0.134],[0.201,0.073,0.076],[0.321,0.129,0.128],[0.479,0.214,0.185],[0.183,0.064,0.065],[0.18,0.07,0.067],[0.159,0.061,0.059]],"source":"datafusion-partitioned/results/c6a.xlarge.json"} diff --git a/datafusion-partitioned/README.md b/datafusion-partitioned/README.md index 503fa565d..0b14a8c24 100644 --- a/datafusion-partitioned/README.md +++ b/datafusion-partitioned/README.md @@ -1,6 +1,9 @@ -# DataFusion +# Apache DataFusion -DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. For more information, please check + +[Apache DataFusion]: https://arrow.apache.org/datafusion/ +[Apache Arrow]: https://arrow.apache.org/ We use parquet file here and create an external table for it; and then execute the queries. @@ -10,7 +13,7 @@ The benchmark should be completed in under an hour. 
On-demand pricing is $0.6 pe 1. manually start a AWS EC2 instance - `c6a.4xlarge` - - Ubuntu 22.04 or later + - Ubuntu 24.04 or later - Root 500GB gp2 SSD - no EBS optimized - no instance store @@ -20,16 +23,16 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe 1. `vi benchmark.sh` and modify following line to target Datafusion version ```bash - git checkout 46.0.0 + git checkout 51.0.0 ``` 1. `bash benchmark.sh` +1. `./save-result.sh c6a.4xlarge` ### Know Issues 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) -3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 ## Generate full human readable results (for debugging) diff --git a/datafusion-partitioned/benchmark.sh b/datafusion-partitioned/benchmark.sh index 1c10a401d..5fa1f295c 100755 --- a/datafusion-partitioned/benchmark.sh +++ b/datafusion-partitioned/benchmark.sh @@ -11,9 +11,9 @@ sudo apt-get update -y sudo apt-get install -y gcc echo "Install DataFusion main branch" -git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/ -git checkout 47.0.0 +git clone https://github.com/apache/datafusion.git +cd datafusion/ +git checkout 51.0.0 CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli export PATH="`pwd`/target/release:$PATH" cd .. 
diff --git a/datafusion-partitioned/results/47-c6a.4xlarge.json b/datafusion-partitioned/results/47-c6a.4xlarge.json new file mode 100644 index 000000000..86560aaac --- /dev/null +++ b/datafusion-partitioned/results/47-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion 47 (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "47-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.061,0.019,0.017], + [0.120,0.036,0.035], + [0.214,0.085,0.085], + [0.443,0.090,0.086], + [1.017,0.817,0.837], + [0.961,0.780,0.782], + [0.090,0.025,0.026], + [0.128,0.041,0.038], + [1.050,0.888,0.905], + [1.367,1.007,1.019], + [0.552,0.243,0.234], + [0.697,0.276,0.264], + [1.083,0.828,0.876], + [2.654,1.369,1.430], + [1.130,0.824,0.825], + [1.080,0.951,0.946], + [2.634,1.680,1.691], + [2.591,1.624,1.620], + [5.272,3.377,3.387], + [0.522,0.081,0.074], + [9.761,1.073,1.052], + [11.401,1.293,1.302], + [22.146,2.584,2.588], + [55.505,10.246,10.275], + [2.836,0.431,0.450], + [0.854,0.340,0.343], + [2.847,0.513,0.513], + [9.739,1.521,1.549], + [9.775,9.431,9.480], + [0.535,0.415,0.421], + [2.451,0.766,0.763], + [6.158,0.915,0.913], + [4.622,3.361,3.383], + [10.150,3.631,3.656], + [10.174,3.659,3.687], + [1.294,1.180,1.183], + [0.294,0.114,0.123], + [0.173,0.050,0.052], + [0.280,0.118,0.114], + [0.423,0.163,0.172], + [0.166,0.041,0.041], + [0.165,0.041,0.043], + [0.150,0.036,0.039] + ] +} diff --git a/datafusion-partitioned/results/48-c6a.4xlarge.json b/datafusion-partitioned/results/48-c6a.4xlarge.json new file mode 100644 index 000000000..ab41d098a --- /dev/null +++ b/datafusion-partitioned/results/48-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion 48.0.0 (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "48-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": 
["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.070,0.016,0.018], + [0.124,0.029,0.030], + [0.199,0.069,0.070], + [0.453,0.088,0.083], + [1.168,0.725,0.739], + [0.977,0.777,0.776], + [0.090,0.022,0.021], + [0.123,0.030,0.031], + [1.023,0.905,0.901], + [1.388,0.999,0.988], + [0.560,0.240,0.233], + [0.680,0.263,0.274], + [1.084,0.861,0.877], + [2.688,1.217,1.339], + [1.142,0.834,0.822], + [0.995,0.858,0.857], + [2.688,1.675,1.700], + [2.586,1.613,1.624], + [5.197,3.328,3.352], + [0.360,0.079,0.078], + [9.973,1.075,1.025], + [11.396,1.302,1.279], + [22.070,2.500,2.535], + [55.536,10.283,10.124], + [2.835,0.447,0.435], + [0.865,0.353,0.331], + [2.847,0.517,0.518], + [9.706,1.472,1.535], + [9.666,9.526,9.477], + [0.574,0.426,0.432], + [2.491,0.759,0.723], + [6.162,0.924,0.907], + [4.649,3.361,3.393], + [10.168,3.640,3.652], + [10.098,3.657,3.672], + [1.360,1.160,1.193], + [0.295,0.109,0.105], + [0.172,0.049,0.049], + [0.286,0.096,0.113], + [0.430,0.159,0.162], + [0.182,0.045,0.040], + [0.171,0.038,0.043], + [0.154,0.034,0.037] + ] +} diff --git a/datafusion-partitioned/results/49-c6a.4xlarge.json b/datafusion-partitioned/results/49-c6a.4xlarge.json new file mode 100644 index 000000000..cf7766133 --- /dev/null +++ b/datafusion-partitioned/results/49-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion 49.0.0 (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "49-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.116,0.050,0.050], + [0.173,0.076,0.079], + [0.259,0.118,0.121], + [0.499,0.137,0.132], + [0.915,0.794,0.786], + [1.061,0.863,0.867], + [0.109,0.049,0.051], + [0.178,0.077,0.078], + [1.132,0.983,0.937], + [1.442,1.040,1.052], + [0.614,0.286,0.288], + [0.720,0.300,0.295], + [1.171,0.909,0.896], + [2.659,1.415,1.362], + 
[1.161,0.871,0.866], + [1.017,0.892,0.879], + [2.708,1.685,1.690], + [2.654,1.681,1.670], + [5.280,3.286,3.282], + [0.394,0.126,0.126], + [9.853,1.139,1.135], + [11.475,1.335,1.363], + [22.124,2.602,2.585], + [55.427,9.969,9.878], + [2.894,0.478,0.482], + [0.896,0.314,0.318], + [2.887,0.468,0.456], + [9.817,1.576,1.539], + [9.624,8.898,8.896], + [0.588,0.475,0.471], + [2.515,0.793,0.794], + [6.177,0.961,0.986], + [4.612,3.313,3.315], + [10.257,3.641,3.658], + [10.212,3.661,3.642], + [1.407,1.194,1.215], + [0.369,0.155,0.138], + [0.223,0.095,0.091], + [0.356,0.138,0.139], + [0.520,0.206,0.200], + [0.229,0.081,0.080], + [0.217,0.079,0.081], + [0.198,0.075,0.077] + ] +} diff --git a/datafusion-partitioned/results/50-c6a.4xlarge.json b/datafusion-partitioned/results/50-c6a.4xlarge.json new file mode 100644 index 000000000..816ccaf71 --- /dev/null +++ b/datafusion-partitioned/results/50-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion 50.0.0 (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "50-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.106,0.027,0.027], + [0.167,0.048,0.049], + [0.275,0.089,0.088], + [0.493,0.107,0.103], + [0.972,0.755,0.778], + [1.039,0.850,0.816], + [0.099,0.027,0.027], + [0.166,0.051,0.050], + [1.094,0.944,0.880], + [1.406,1.006,0.946], + [0.599,0.230,0.232], + [0.700,0.258,0.246], + [1.115,0.843,0.847], + [2.840,1.365,1.349], + [1.170,0.817,0.840], + [1.010,0.873,0.850], + [2.713,1.633,1.648], + [2.626,1.632,1.651], + [5.101,3.230,3.270], + [0.370,0.106,0.098], + [9.907,1.099,1.101], + [11.448,1.386,1.380], + [22.140,2.564,2.553], + [52.780,9.008,8.951], + [0.395,0.151,0.155], + [0.936,0.276,0.275], + [0.399,0.152,0.158], + [9.806,1.541,1.559], + [9.669,8.939,9.169], + [0.556,0.414,0.419], + [2.517,0.760,0.758], + [6.189,0.942,0.920], + [4.573,3.275,3.246], + 
[10.247,3.616,3.662], + [10.224,3.584,3.625], + [1.348,1.244,1.186], + [0.335,0.115,0.129], + [0.222,0.068,0.071], + [0.325,0.135,0.132], + [0.498,0.193,0.172], + [0.209,0.057,0.057], + [0.205,0.059,0.056], + [0.183,0.052,0.052] + ] +} diff --git a/datafusion-partitioned/results/51-c6a.4xlarge.json b/datafusion-partitioned/results/51-c6a.4xlarge.json new file mode 100644 index 000000000..f34d33cef --- /dev/null +++ b/datafusion-partitioned/results/51-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion 51 (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "51-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.098,0.032,0.031], + [0.144,0.054,0.053], + [0.260,0.095,0.096], + [0.644,0.111,0.111], + [1.174,0.803,0.790], + [1.139,0.835,0.828], + [0.108,0.031,0.032], + [0.164,0.056,0.055], + [1.093,0.909,0.951], + [1.756,1.004,1.020], + [0.673,0.236,0.236], + [0.840,0.256,0.251], + [1.300,0.842,0.843], + [2.702,1.346,1.356], + [1.213,0.809,0.824], + [1.063,0.882,0.872], + [2.765,1.684,1.687], + [2.743,1.675,1.671], + [5.557,3.341,3.359], + [0.353,0.103,0.100], + [10.171,1.099,1.123], + [11.557,1.368,1.334], + [22.327,2.612,2.599], + [52.202,9.208,9.074], + [0.374,0.157,0.156], + [1.120,0.251,0.252], + [0.768,0.160,0.161], + [10.076,1.479,1.506], + [9.603,8.859,9.045], + [0.569,0.444,0.424], + [3.228,0.800,0.772], + [6.972,0.984,0.960], + [5.114,3.492,3.501], + [10.275,3.631,3.636], + [10.212,3.625,3.617], + [1.397,1.227,1.196], + [0.339,0.139,0.133], + [0.212,0.086,0.074], + [0.353,0.139,0.137], + [0.512,0.208,0.213], + [0.199,0.069,0.070], + [0.189,0.066,0.065], + [0.173,0.057,0.058] + ] +} diff --git a/datafusion-partitioned/results/51-c6a.4xlarge.json~ b/datafusion-partitioned/results/51-c6a.4xlarge.json~ new file mode 100644 index 000000000..aa6656302 --- /dev/null +++ 
b/datafusion-partitioned/results/51-c6a.4xlarge.json~ @@ -0,0 +1,56 @@ +{ + "system": "DataFusion (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "51-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.098,0.032,0.031], + [0.144,0.054,0.053], + [0.260,0.095,0.096], + [0.644,0.111,0.111], + [1.174,0.803,0.790], + [1.139,0.835,0.828], + [0.108,0.031,0.032], + [0.164,0.056,0.055], + [1.093,0.909,0.951], + [1.756,1.004,1.020], + [0.673,0.236,0.236], + [0.840,0.256,0.251], + [1.300,0.842,0.843], + [2.702,1.346,1.356], + [1.213,0.809,0.824], + [1.063,0.882,0.872], + [2.765,1.684,1.687], + [2.743,1.675,1.671], + [5.557,3.341,3.359], + [0.353,0.103,0.100], + [10.171,1.099,1.123], + [11.557,1.368,1.334], + [22.327,2.612,2.599], + [52.202,9.208,9.074], + [0.374,0.157,0.156], + [1.120,0.251,0.252], + [0.768,0.160,0.161], + [10.076,1.479,1.506], + [9.603,8.859,9.045], + [0.569,0.444,0.424], + [3.228,0.800,0.772], + [6.972,0.984,0.960], + [5.114,3.492,3.501], + [10.275,3.631,3.636], + [10.212,3.625,3.617], + [1.397,1.227,1.196], + [0.339,0.139,0.133], + [0.212,0.086,0.074], + [0.353,0.139,0.137], + [0.512,0.208,0.213], + [0.199,0.069,0.070], + [0.189,0.066,0.065], + [0.173,0.057,0.058] + ] +} diff --git a/datafusion-partitioned/results/51-metadata-cache-c6a.4xlarge.json b/datafusion-partitioned/results/51-metadata-cache-c6a.4xlarge.json new file mode 100644 index 000000000..7b8d4a428 --- /dev/null +++ b/datafusion-partitioned/results/51-metadata-cache-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion 51 'metadata cache' (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "51-metadata-cache-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + 
[,,,0.003,0.002,0.002], + [,,,0.053,0.024,0.024], + [,,,0.184,0.066,0.066], + [,,,0.465,0.086,0.078], + [,,,1.036,0.746,0.740], + [,,,1.006,0.792,0.794], + [,,,0.006,0.002,0.003], + [,,,0.069,0.025,0.027], + [,,,0.979,0.869,0.877], + [,,,1.621,1.001,0.958], + [,,,0.568,0.207,0.207], + [,,,0.746,0.228,0.229], + [,,,1.099,0.790,0.811], + [2.629,1.307,1.232], + [1.125,0.780,0.762], + [0.935,0.860,0.830], + [2.660,1.642,1.637], + [2.638,1.626,1.624], + [5.464,3.329,3.330], + [0.271,0.074,0.075], + [10.063,1.086,1.085], + [11.445,1.327,1.320], + [22.185,2.562,2.578], + [53.025,9.062,8.995], + [0.273,0.126,0.128], + [1.023,0.220,0.231], + [0.332,0.130,0.129], + [10.203,1.470,1.442], + [9.478,8.960,8.938], + [0.476,0.414,0.389], + [3.083,0.763,0.767], + [6.852,0.947,0.954], + [4.990,3.365,3.402], + [10.136,3.576,3.573], + [10.159,3.547,3.567], + [1.264,1.169,1.178], + [0.246,0.109,0.112], + [0.108,0.055,0.044], + [0.234,0.116,0.117], + [0.384,0.185,0.180], + [0.099,0.038,0.041], + [0.084,0.035,0.035], + [0.073,0.030,0.030] + ] +} diff --git a/datafusion-partitioned/results/51-run1-c6a.4xlarge.json b/datafusion-partitioned/results/51-run1-c6a.4xlarge.json new file mode 100644 index 000000000..04af78682 --- /dev/null +++ b/datafusion-partitioned/results/51-run1-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "51-run1-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.117,0.031,0.032], + [0.161,0.053,0.054], + [0.265,0.097,0.093], + [0.615,0.108,0.110], + [1.149,0.764,0.787], + [1.145,0.830,0.833], + [0.116,0.033,0.032], + [0.174,0.056,0.056], + [1.093,0.901,0.905], + [1.744,1.019,1.034], + [0.665,0.231,0.233], + [0.858,0.252,0.254], + [1.224,0.833,0.835], + [2.788,1.333,1.359], + [1.225,0.823,0.838], + [1.051,0.871,0.862], + [2.767,1.677,1.679], + 
[2.733,1.674,1.680], + [5.589,3.356,3.343], + [0.397,0.104,0.103], + [10.100,1.095,1.098], + [11.551,1.355,1.353], + [22.329,2.652,2.615], + [53.214,9.111,9.106], + [0.333,0.158,0.153], + [1.193,0.247,0.263], + [0.478,0.163,0.154], + [10.282,1.476,1.484], + [9.457,9.054,8.822], + [0.577,0.434,0.437], + [3.211,0.794,0.811], + [6.971,0.971,0.972], + [5.092,3.532,3.460], + [10.255,3.590,3.560], + [10.187,3.591,3.608], + [1.426,1.207,1.203], + [0.338,0.133,0.138], + [0.208,0.071,0.071], + [0.337,0.135,0.121], + [0.498,0.196,0.209], + [0.204,0.062,0.067], + [0.190,0.062,0.063], + [0.175,0.058,0.056] + ] +} diff --git a/datafusion-partitioned/results/51-run2-c6a.4xlarge.json b/datafusion-partitioned/results/51-run2-c6a.4xlarge.json new file mode 100644 index 000000000..d0d35a690 --- /dev/null +++ b/datafusion-partitioned/results/51-run2-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "51-run2-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.118,0.032,0.032], + [0.166,0.055,0.055], + [0.361,0.098,0.094], + [0.553,0.110,0.112], + [1.115,0.754,0.775], + [1.133,0.817,0.850], + [0.111,0.032,0.031], + [0.169,0.055,0.055], + [1.105,0.922,0.894], + [1.782,1.005,0.994], + [0.664,0.237,0.226], + [0.876,0.253,0.261], + [1.235,0.869,0.821], + [2.859,1.322,1.346], + [1.193,0.832,0.810], + [1.021,0.874,0.876], + [2.724,1.679,1.678], + [2.758,1.656,1.648], + [5.575,3.343,3.341], + [0.391,0.106,0.106], + [10.138,1.124,1.112], + [11.555,1.354,1.322], + [22.324,2.563,2.612], + [52.743,8.973,9.175], + [0.364,0.164,0.166], + [1.159,0.251,0.249], + [0.364,0.162,0.164], + [10.435,1.498,1.499], + [9.459,9.046,9.061], + [0.568,0.435,0.443], + [3.205,0.787,0.776], + [6.947,0.957,0.977], + [5.091,3.461,3.458], + [10.233,3.565,3.615], + [10.214,3.578,3.591], + 
[1.351,1.221,1.231], + [0.339,0.134,0.140], + [0.211,0.075,0.074], + [0.341,0.121,0.144], + [0.490,0.209,0.211], + [0.206,0.068,0.067], + [0.199,0.066,0.060], + [0.178,0.057,0.058] + ] +} diff --git a/datafusion-partitioned/results/51-run3-c6a.4xlarge.json b/datafusion-partitioned/results/51-run3-c6a.4xlarge.json new file mode 100644 index 000000000..86a948fc4 --- /dev/null +++ b/datafusion-partitioned/results/51-run3-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "51-run3-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.112,0.032,0.032], + [0.160,0.056,0.054], + [0.271,0.094,0.092], + [0.617,0.113,0.111], + [1.136,0.777,0.763], + [1.097,0.823,0.844], + [0.108,0.031,0.031], + [0.169,0.056,0.055], + [1.100,0.936,0.924], + [1.755,0.986,0.998], + [0.663,0.231,0.232], + [0.876,0.260,0.252], + [1.216,0.839,0.836], + [2.695,1.355,1.372], + [1.201,0.800,0.830], + [1.052,0.870,0.863], + [2.756,1.681,1.666], + [2.752,1.642,1.646], + [5.520,3.360,3.324], + [0.365,0.105,0.104], + [10.169,1.091,1.080], + [11.548,1.341,1.363], + [22.325,2.569,2.523], + [52.273,8.957,8.932], + [0.370,0.154,0.162], + [1.124,0.255,0.253], + [0.416,0.151,0.160], + [10.369,1.470,1.471], + [9.725,8.778,9.026], + [0.584,0.438,0.438], + [3.197,0.784,0.772], + [6.972,0.957,0.972], + [5.095,3.457,3.401], + [10.290,3.586,3.605], + [10.213,3.620,3.600], + [1.360,1.201,1.200], + [0.330,0.131,0.140], + [0.218,0.072,0.075], + [0.338,0.144,0.123], + [0.477,0.207,0.175], + [0.204,0.067,0.068], + [0.190,0.065,0.062], + [0.180,0.056,0.058] + ] +} diff --git a/datafusion-partitioned/results/51-run4-c6a.4xlarge.json b/datafusion-partitioned/results/51-run4-c6a.4xlarge.json new file mode 100644 index 000000000..75cf1217f --- /dev/null +++ 
b/datafusion-partitioned/results/51-run4-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "51-run4-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.117,0.032,0.032], + [0.160,0.055,0.054], + [0.500,0.103,0.096], + [0.580,0.110,0.109], + [1.133,0.780,0.762], + [1.111,0.837,0.829], + [0.111,0.031,0.031], + [0.173,0.057,0.055], + [1.084,0.928,0.915], + [1.762,1.004,0.995], + [0.857,0.231,0.243], + [0.745,0.254,0.255], + [1.233,0.854,0.862], + [2.731,1.353,1.372], + [1.210,0.807,0.799], + [1.050,0.880,0.870], + [2.760,1.682,1.686], + [2.734,1.659,1.671], + [5.612,3.362,3.342], + [0.337,0.101,0.102], + [10.192,1.107,1.097], + [11.555,1.372,1.371], + [22.324,2.544,2.570], + [53.926,9.134,9.111], + [0.414,0.157,0.152], + [1.111,0.253,0.262], + [0.352,0.156,0.163], + [10.398,1.513,1.481], + [9.630,8.995,8.946], + [0.575,0.440,0.446], + [3.218,0.808,0.792], + [6.972,0.983,0.972], + [5.088,3.442,3.461], + [10.141,3.607,3.601], + [10.260,3.624,3.647], + [1.375,1.218,1.217], + [0.332,0.141,0.122], + [0.199,0.071,0.073], + [0.320,0.149,0.143], + [0.476,0.208,0.214], + [0.197,0.068,0.062], + [0.186,0.069,0.061], + [0.175,0.059,0.057] + ] +} diff --git a/datafusion-partitioned/results/51-run5-c6a.4xlarge.json b/datafusion-partitioned/results/51-run5-c6a.4xlarge.json new file mode 100644 index 000000000..9bb8466a7 --- /dev/null +++ b/datafusion-partitioned/results/51-run5-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "51-run5-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.111,0.032,0.032], + [0.158,0.054,0.054], + 
[0.273,0.099,0.097], + [0.615,0.113,0.105], + [1.166,0.772,0.771], + [1.122,0.855,0.849], + [0.111,0.032,0.032], + [0.172,0.056,0.055], + [1.089,0.895,0.914], + [1.730,1.016,1.016], + [0.662,0.239,0.240], + [0.852,0.253,0.257], + [1.258,0.852,0.844], + [2.779,1.364,1.354], + [1.234,0.819,0.809], + [1.013,0.866,0.865], + [2.747,1.679,1.685], + [2.743,1.681,1.654], + [5.721,3.611,3.764], + [0.336,0.113,0.112], + [10.175,1.221,1.200], + [11.559,1.553,1.516], + [22.321,3.006,2.941], + [53.553,10.493,10.529], + [0.369,0.172,0.166], + [1.158,0.274,0.270], + [0.346,0.167,0.163], + [10.387,1.638,1.673], + [9.497,9.033,9.070], + [0.574,0.457,0.434], + [3.216,0.902,0.890], + [6.974,1.136,1.131], + [5.116,4.066,3.943], + [10.325,4.111,4.134], + [10.355,4.030,4.066], + [1.524,1.325,1.340], + [0.348,0.141,0.151], + [0.207,0.078,0.073], + [0.344,0.147,0.138], + [0.483,0.211,0.217], + [0.208,0.063,0.067], + [0.189,0.067,0.062], + [0.181,0.060,0.057] + ] +} diff --git a/datafusion-partitioned/results/main-c6a.4xlarge.json b/datafusion-partitioned/results/main-c6a.4xlarge.json new file mode 100644 index 000000000..ca27a693b --- /dev/null +++ b/datafusion-partitioned/results/main-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion main (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "main-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.111,0.032,0.032], + [0.171,0.053,0.054], + [0.358,0.098,0.098], + [0.998,0.107,0.109], + [0.997,0.764,0.783], + [1.523,0.819,0.821], + [0.115,0.031,0.032], + [0.179,0.054,0.055], + [1.114,0.926,0.886], + [1.738,1.032,1.034], + [0.662,0.233,0.234], + [0.849,0.254,0.261], + [1.226,0.838,0.864], + [2.856,1.391,1.382], + [1.172,0.821,0.822], + [1.033,0.867,0.884], + [2.753,1.699,1.693], + [2.737,1.662,1.659], + [5.572,3.329,3.366], + [0.362,0.102,0.104], + [10.167,1.114,1.095], + 
[11.563,1.317,1.308], + [22.317,2.505,2.583], + [52.689,9.281,9.168], + [0.610,0.157,0.164], + [0.938,0.258,0.264], + [0.398,0.156,0.159], + [10.417,1.542,1.501], + [9.721,9.091,8.881], + [0.591,0.441,0.433], + [3.211,0.762,0.770], + [6.966,0.975,0.997], + [5.092,3.487,3.467], + [10.290,3.663,3.641], + [10.246,3.642,3.616], + [1.315,1.134,1.172], + [0.351,0.138,0.142], + [0.220,0.076,0.074], + [0.354,0.141,0.138], + [0.492,0.206,0.207], + [0.208,0.075,0.067], + [0.203,0.061,0.068], + [0.191,0.057,0.058] + ] +} diff --git a/datafusion-partitioned/results/prefetch-c6a.4xlarge.json b/datafusion-partitioned/results/prefetch-c6a.4xlarge.json new file mode 100644 index 000000000..4d5ad9b1b --- /dev/null +++ b/datafusion-partitioned/results/prefetch-c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "DataFusion Prefetch (Parquet, partitioned)", + "date": "2025-11-24", + "machine": "prefetch-c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + [0.117,0.032,0.032], + [0.155,0.054,0.052], + [0.246,0.099,0.105], + [0.609,0.106,0.107], + [1.197,0.762,0.749], + [1.110,0.867,0.828], + [0.117,0.031,0.032], + [0.164,0.059,0.056], + [1.089,0.920,0.893], + [1.804,0.983,0.993], + [0.664,0.242,0.224], + [0.839,0.259,0.259], + [1.180,0.835,0.833], + [2.717,1.206,1.339], + [1.200,0.814,0.832], + [1.004,0.894,0.883], + [2.837,1.678,1.720], + [2.839,1.659,1.684], + [5.639,3.337,3.293], + [0.316,0.105,0.105], + [10.214,1.124,1.117], + [11.618,1.285,1.321], + [22.484,2.595,2.651], + [54.355,9.130,9.234], + [0.520,0.158,0.179], + [1.185,0.252,0.262], + [0.577,0.187,0.176], + [10.521,1.540,1.499], + [10.212,9.104,8.679], + [0.532,0.431,0.431], + [3.196,0.777,0.795], + [6.982,0.995,0.978], + [5.117,3.446,3.427], + [10.192,3.661,3.649], + [10.206,3.635,3.664], + [1.246,1.119,1.132], + [0.277,0.122,0.118], + [0.203,0.075,0.076], + [0.268,0.127,0.117], + 
[0.399,0.189,0.191], + [0.197,0.065,0.061], + [0.173,0.059,0.064], + [0.178,0.055,0.055] + ] +} diff --git a/datafusion-partitioned/save-result.sh b/datafusion-partitioned/save-result.sh new file mode 100755 index 000000000..ee9422c26 --- /dev/null +++ b/datafusion-partitioned/save-result.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# This script converts the raw results.csv data from `benchmark.sh` into the +# final json format used by the benchmark dashboard. +# +# usage : ./save-result.sh MACHINE +# +# example (save results/c6a.4xlarge.json) +# ./save-result.sh c6a.4xlarge + +MACHINE=$1 +OUTPUT_FILE="results/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Parquet, partitioned)" +DATE=$(date +%Y-%m-%d) + + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14737666736, + "result": [ + $RESULT_ARRAY + ] +} +EOF \ No newline at end of file diff --git a/datafusion/README.md b/datafusion/README.md index 503fa565d..d711b4749 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -1,16 +1,19 @@ # DataFusion -DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check +[Apache DataFusion] is an extensible query execution framework, written in Rust, that uses [Apache Arrow] as its in-memory format. For more information, please check + +[Apache DataFusion]: https://arrow.apache.org/datafusion/ +[Apache Arrow]: https://arrow.apache.org/ We use parquet file here and create an external table for it; and then execute the queries.
-## Generate benchmark results +## Cookbook: Generate benchmark results The benchmark should be completed in under an hour. On-demand pricing is $0.6 per hour while spot pricing is only $0.2 to $0.3 per hour (us-east-2). 1. manually start a AWS EC2 instance - `c6a.4xlarge` - - Ubuntu 22.04 or later + - Ubuntu 24.04 or later - Root 500GB gp2 SSD - no EBS optimized - no instance store @@ -20,16 +23,16 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe 1. `vi benchmark.sh` and modify following line to target Datafusion version ```bash - git checkout 46.0.0 + git checkout 51.0.0 ``` 1. `bash benchmark.sh` +1. `./save-result.sh c6a.4xlarge` ### Know Issues 1. importing parquet by `datafusion-cli` doesn't support schema, need to add some casting in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) 2. importing parquet by `datafusion-cli` make column name column name case-sensitive, i change all column name in queries.sql to double quoted literal (e.g. `EventTime` -> `"EventTime"`) -3. `comparing binary with utf-8` and `group by binary` don't work in mac, if you run these queries in mac, you'll get some errors for queries contain binary format apache/arrow-datafusion#3050 ## Generate full human readable results (for debugging) diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index c16368cd3..68af7edef 100755 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -11,9 +11,9 @@ sudo apt-get update -y sudo apt-get install -y gcc echo "Install DataFusion main branch" -git clone https://github.com/apache/arrow-datafusion.git -cd arrow-datafusion/ -git checkout 47.0.0 +git clone https://github.com/apache/datafusion.git +cd datafusion/ +git checkout 51.0.0 CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli export PATH="`pwd`/target/release:$PATH" cd .. 
diff --git a/datafusion/save-result.sh b/datafusion/save-result.sh new file mode 100755 index 000000000..115b8ff5f --- /dev/null +++ b/datafusion/save-result.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# This script converts the raw results.csv data from `benchmark.sh` into the +# final json format used by the benchmark dashboard. +# +# usage : ./save-result.sh MACHINE +# +# example (save results/c6a.4xlarge.json) +# ./save-result.sh c6a.4xlarge + +MACHINE=$1 +OUTPUT_FILE="results/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Parquet, single)" +DATE=$(date +%Y-%m-%d) + + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + $RESULT_ARRAY + ] +} +EOF \ No newline at end of file