Skip to content

Commit d181607

Browse files
Performance benchmark data (#134)
* Added rand-int and rand-uniform * Added random-int and random-unif * Added random-exp + clean up * Bug fixes for random number generation + partition-by for writer * Cleaner data generation script * Fixed lint error * Added codecov yml * Added public token * Take down code-cov token * Renamed codecov * Added public token * Moved codecov token
1 parent d28dc98 commit d181607

10 files changed

+197
-8
lines changed

.github/workflows/continuous-integration.yml

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ jobs:
2828
entrypoint: lein
2929
args: coverage
3030
- uses: codecov/codecov-action@v1
  env:
    # Read the upload token from the repository's encrypted secrets rather
    # than committing it in plain text. Any token that has already been
    # pushed to the repository history should be regenerated in Codecov.
    CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
  with:
    file: target/coverage/codecov.json
    fail_ci_if_error: true

codecov.yml

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
codecov:
2+
require_ci_to_pass: yes
3+
4+
coverage:
5+
precision: 3
6+
round: down
7+
range: "70...100"
8+
9+
parsers:
10+
gcov:
11+
branch_detection:
12+
conditional: yes
13+
loop: yes
14+
method: no
15+
macro: no
16+
17+
comment:
18+
layout: "reach,diff,flags,tree"
19+
behavior: default
20+
require_changes: no
+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
(ns examples.performance-benchmark-data
2+
(:require
3+
[zero-one.geni.core :as g]))
4+
5+
(def data-path "/data/performance-benchmark-data")
6+
7+
(def skeleton-df
8+
(g/cache (g/table->dataset (repeat (int 2e6) [1]) [:dummy])))
9+
10+
(defn transaction-id-col
  "Builds a column made of three stringified `g/random-int` columns joined
  with a literal \"-\" separator."
  []
  (let [separator (g/lit "-")
        parts     (repeatedly 3 #(g/str (g/random-int)))]
    ;; `apply` realises the lazy parts in order, so the three random columns
    ;; are created left to right.
    (apply g/concat (interpose separator parts))))
16+
17+
(def date-col
18+
(g/concat :year (g/lit "-") :month (g/lit "-") :day))
19+
20+
;; Number of days in each calendar month of a non-leap year, keyed by month
;; number (1 = January ... 12 = December). February is fixed at 28 days.
(def max-days
  {1  31
   2  28
   3  31
   4  30
   5  31
   6  30
   7  31
   8  31
   9  30
   10 31
   11 30
   12 31})
32+
33+
;; Generates one batch of synthetic transactions per month (1 to 12) from
;; `skeleton-df` and appends each batch to the parquet dataset at `data-path`.
;; `doseq` is used because the loop exists purely for its side effects; no
;; sequence of results needs to be built or retained.
(doseq [month (range 1 13)]
  (-> skeleton-df
      (g/select
        {:trx-id    (transaction-id-col)
         :member-id (g/int (g/rexp 5e-6))
         :quantity  (g/int (g/inc (g/rexp)))
         :price     (g/pow 2 (g/random-int 16 20))
         :style-id  (g/int (g/rexp 1e-2))
         :brand-id  (g/int (g/rexp 1e-2))
         :year      2019
         :month     month
         ;; the upper bound passed to random-int is one past the last day
         :day       (g/random-int 1 (inc (max-days month)))})
      (g/with-column :date (g/to-date date-col))
      (g/coalesce 1)
      (g/write-parquet! data-path {:mode "append"})))
49+
50+
(comment
51+
52+
(-> (g/read-parquet! data-path)
53+
(g/group-by :trx-id)
54+
g/count
55+
(g/describe :count)
56+
g/show)
57+
58+
true)

src/zero_one/geni/core.clj

+11
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
partition-by
5050
pos?
5151
rand
52+
rand-int
5253
rand-nth
5354
remove
5455
rename-keys
@@ -149,6 +150,7 @@
149150
(import-vars
150151
[zero-one.geni.sql
151152
!
153+
**
152154
->date-col
153155
->timestamp-col
154156
->utc-timestamp
@@ -565,6 +567,15 @@
565567
nunique
566568
qcut
567569
random-choice
570+
random-exp
571+
random-int
572+
random-norm
573+
random-uniform
574+
rchoice
575+
rexp
576+
rnorm
577+
runif
578+
runiform
568579
select-columns
569580
shape
570581
value-counts])

src/zero_one/geni/data_sources.clj

+10-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
(:refer-clojure :exclude [partition-by sort-by])
33
(:require
44
[camel-snake-kebab.core :refer [->camelCase]]
5-
[zero-one.geni.defaults])
5+
[zero-one.geni.defaults]
6+
[zero-one.geni.utils :refer [ensure-coll]])
67
(:import
78
(org.apache.spark.sql SparkSession)))
89

@@ -82,15 +83,21 @@
8283
configured-reader (configure-reader-or-writer unconfigured-reader options)]
8384
(.load configured-reader))))
8485

86+
(defn- partition-by-arg
  "Turns `partition-id` (passed through `ensure-coll`) into a Java String
  array holding the `name` of each element, as used for `.partitionBy`."
  [partition-id]
  (->> partition-id
       ensure-coll
       (map name)
       (into-array java.lang.String)))
88+
8589
(defn write-data! [format dataframe path options]
8690
(let [mode (:mode options)
91+
partition-id (:partition-by options)
8792
unconfigured-writer (-> dataframe
8893
(.write)
8994
(.format format)
90-
(cond-> mode (.mode mode)))
95+
(cond-> mode (.mode mode))
96+
(cond-> partition-id
97+
(.partitionBy (partition-by-arg partition-id))))
9198
configured-writer (configure-reader-or-writer
9299
unconfigured-writer
93-
(dissoc options :mode))]
100+
(dissoc options :mode :partition-by))]
94101
(.save configured-writer path)))
95102

96103
(defn write-parquet!

src/zero_one/geni/foreign_idioms.clj

+38-2
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,52 @@
2323
(str low)
2424
(str high))))))
2525

26+
(defn random-uniform
  "Returns a column computed as `base + length * rand(seed)`, where `base`
  is the smaller of `low` and `high` and `length` is their absolute
  difference. The bounds default to 0.0 and 1.0; without an explicit `seed`
  one is drawn with `rand-int`."
  ([] (random-uniform 0.0 1.0))
  ([low high]
   (let [seed (rand-int Integer/MAX_VALUE)]
     (random-uniform low high seed)))
  ([low high seed]
   (let [lower-bound (min high low)
         width       (Math/abs (- high low))
         scaled      (column/* width (sql/rand seed))]
     (column/+ lower-bound scaled))))

;; Short aliases for `random-uniform`.
(def runiform random-uniform)
(def runif random-uniform)
35+
36+
(defn random-norm
  "Returns a column computed as `mu + sigma * randn(seed)`. `mu` and `sigma`
  default to 0.0 and 1.0; without an explicit `seed` one is drawn with
  `rand-int`."
  ([] (random-norm 0.0 1.0))
  ([mu sigma]
   (let [seed (rand-int Integer/MAX_VALUE)]
     (random-norm mu sigma seed)))
  ([mu sigma seed]
   (let [scaled-noise (column/* sigma (sql/randn seed))]
     (column/+ mu scaled-noise))))

;; Short alias for `random-norm`.
(def rnorm random-norm)
41+
42+
(defn random-exp
  "Returns a column computed as `(log(rand(seed)) * -1.0) / rate`. `rate`
  defaults to 1.0; without an explicit `seed` one is drawn with `rand-int`."
  ([] (random-exp 1.0))
  ([rate]
   (let [seed (rand-int Integer/MAX_VALUE)]
     (random-exp rate seed)))
  ([rate seed]
   (let [log-draw     (sql/log (sql/rand seed))
         negated-draw (column/* log-draw -1.0)]
     (column// negated-draw rate))))

;; Short alias for `random-exp`.
(def rexp random-exp)
50+
51+
(defn random-int
  "Returns a column computed as `long(base) + long(length * rand(seed))`,
  where `base` is the smaller of `low` and `high` and `length` is their
  absolute difference. With no arguments the bounds are 0 and
  `(dec Integer/MAX_VALUE)`; without an explicit `seed` one is drawn with
  `rand-int`."
  ([] (random-int 0 (dec Integer/MAX_VALUE)))
  ([low high]
   (let [seed (rand-int Integer/MAX_VALUE)]
     (random-int low high seed)))
  ([low high seed]
   (let [as-long     (fn [expr] (column/cast expr "long"))
         width       (Math/abs (- high low))
         lower-bound (min high low)
         offset      (as-long (column/* width (sql/rand seed)))]
     (column/+ (as-long lower-bound) offset))))
59+
2660
(defn random-choice
2761
([choices]
2862
(let [n-choices (count choices)]
2963
(random-choice choices (take n-choices (repeat (/ 1.0 n-choices))))))
30-
([choices probs]
64+
([choices probs] (random-choice choices probs (rand-int Integer/MAX_VALUE)))
65+
([choices probs seed]
3166
(assert (and (= (count choices) (count probs))
3267
(every? pos? probs))
3368
"random-choice args must have same lengths.")
3469
(assert (< (Math/abs (- (apply + probs) 1.0)) 1e-4)
3570
"random-choice probs must some to one.")
36-
(let [rand-col (column/->column (sql/rand))
71+
(let [rand-col (column/->column (sql/rand seed))
3772
cum-probs (reductions + probs)
3873
choice-cols (map (fn [choice prob]
3974
(sql/when (column/< rand-col (+ prob 1e-6))
@@ -42,6 +77,7 @@
4277
cum-probs)]
4378
(.as (apply polymorphic/coalesce choice-cols)
4479
(format "choice(%s, %s)" (str choices) (str probs))))))
80+
(def rchoice random-choice)
4581

4682
;; Pandas
4783
(defn value-counts [dataframe]

src/zero_one/geni/main.clj

+3-3
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@
3232

3333
(require '[clojure.reflect :as r])
3434
(import '(org.apache.spark.sql Dataset))
35-
(->> (r/reflect Dataset)
35+
(->> (r/reflect (.write dataframe))
3636
:members
37-
(clojure.core/filter #(= (:name %) 'toDF))
37+
(clojure.core/filter #(= (:name %) 'partitionBy))
3838
;(mapv :parameter-types)
3939
;(clojure.core/filter #(= (:name %) 'toDF))
4040
;clojure.core/sort
41-
pprint)
41+
clojure.pprint/pprint)
4242

4343
0)

src/zero_one/geni/sql.clj

+1
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@
213213
(defn log2 [expr] (functions/log2 (->column expr)))
214214
(defn pmod [left-expr right-expr] (functions/pmod (->column left-expr) (->column right-expr)))
215215
(defn pow [base exponent] (functions/pow (->column base) (->column exponent)))
216+
(def ** pow)
216217
(defn radians [expr] (functions/radians (->column expr)))
217218
(defn rint [expr] (functions/rint (->column expr)))
218219
(defn round [expr] (functions/round (->column expr)))

test/zero_one/geni/data_sources_test.clj

+9
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,12 @@
118118
:url (str "jdbc:sqlite:" temp-file)
119119
:dbtable "housing"}))]
120120
(g/collect-vals write-df) => (g/collect-vals read-df)))
121+
122+
(fact "Can write parquet with :partition-by option" :slow
123+
(let [temp-file (.toString (create-temp-file! ".parquet"))
124+
read-df (do (g/write-parquet!
125+
write-df
126+
temp-file
127+
{:mode "overwrite" :partition-by [:Method]})
128+
(g/read-parquet! temp-file))]
129+
(set (g/collect write-df)) => (set (g/collect read-df))))

test/zero_one/geni/numpy_test.clj

+45
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,51 @@
44
[zero-one.geni.core :as g]
55
[zero-one.geni.test-resources :refer [df-20]]))
66

7+
(defn descriptive-stats
  "Evaluates `col` as column :x over a 200-row dataset (:idx 0 to 199) and
  returns the first row of an aggregation holding the :min, :mean, :std and
  :max of :x."
  [col]
  (let [rows       (mapv vector (range 200))
        with-x     (g/with-column (g/table->dataset rows [:idx]) :x col)
        summary-df (g/agg with-x {:min  (g/min :x)
                                  :mean (g/mean :x)
                                  :std  (g/stddev :x)
                                  :max  (g/max :x)})]
    (g/first summary-df)))
15+
16+
(fact "On random-exp" :slow
17+
(descriptive-stats (g/random-exp)) => #(and (< 0.5 (:mean %) 1.5)
18+
(< 0.5 (:std %) 1.5))
19+
(descriptive-stats (g/random-exp 5)) => #(and (< 0.1 (:mean %) 0.3)
20+
(< 0.1 (:std %) 0.3)))
21+
22+
(fact "On random-norm" :slow
  ;; With the default arguments (mu 0.0, sigma 1.0) the sample standard
  ;; deviation must be close to 1. A negative lower bound would accept any
  ;; value below 1.2, so the bound is 0.8.
  (descriptive-stats (g/random-norm)) => #(and (< -0.3 (:mean %) 0.3)
                                               (< 0.8 (:std %) 1.2))
  (descriptive-stats (g/random-norm -3 2)) => #(and (< -4.0 (:mean %) -2.0)
                                                    (< 1.5 (:std %) 2.5)))
27+
28+
(fact "On random-int" :slow
29+
(descriptive-stats (g/random-int)) => #(and (pos? (:max %))
30+
(pos? (:min %))
31+
(integer? (:max %))
32+
(integer? (:min %)))
33+
(descriptive-stats (g/random-int 1 13)) => #(and (= (:max %) 12)
34+
(= (:min %) 1)
35+
(integer? (:max %))
36+
(integer? (:min %)))
37+
(descriptive-stats (g/random-int -5 -2)) => #(and (= (:max %) -3)
38+
(= (:min %) -5)
39+
(integer? (:max %))
40+
(integer? (:min %))))
41+
42+
(fact "On random-uniform" :slow
43+
(descriptive-stats (g/random-uniform)) => #(and (< 0.95 (:max %) 1.00)
44+
(< 0.00 (:min %) 0.05)
45+
(double? (:max %))
46+
(double? (:min %)))
47+
(descriptive-stats (g/random-uniform -0.5 -1.0)) => #(and (< -0.55 (:max %) -0.50)
48+
(< -1.00 (:min %) -0.95)
49+
(double? (:max %))
50+
(double? (:min %))))
51+
752
(fact "On random-choice" :slow
853
(-> (g/table->dataset (mapv vector (range 100)) [:idx])
954
(g/with-column :rand-choice (g/random-choice [(g/lit "abc")

0 commit comments

Comments
 (0)