#include <stdio.h>
#include <math.h>
#include <stdbool.h>

// Bring the catboost module into scope
#include <path_to_dir_with_header_file/c_api.h>
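//
// Build note (a sketch, assuming a standard CatBoost setup; adjust names and paths to yours):
// this example needs the CatBoost model-applying library and libm at link time, e.g.
//
//     gcc <this_file>.c -o apply_model -L<path_to_dir_with_libcatboostmodel> -lcatboostmodel -lm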

double sigmoid(double x) {
    return 1. / (1. + exp(-x));
}

const char* answer(bool makes_over_50k_a_year) {
    if (makes_over_50k_a_year) {
        return "makes over 50k a year";
    } else {
        return "doesn't make over 50k a year";
    }
}

int main(int argc, const char* argv[]) {
    // Load the "adult.cbm" model that we trained within the Jupyter Notebook
    ModelCalcerHandle* modelHandle;
    modelHandle = ModelCalcerCreate();
    if (!LoadFullModelFromFile(modelHandle, "adult.cbm")) {
        printf("LoadFullModelFromFile error message: %s\n", GetErrorString());
    }
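
    // In a real application you would most likely stop here when loading fails, e.g. (sketch):
    //
    //     ModelCalcerDelete(modelHandle);
    //     return 1;
    //
    // This tutorial just reports the error and keeps going.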

    // You can also try to load your own model: just replace "adult.cbm" with the path to your model that classifies
    // data from the UCI Adult Dataset.

    printf("Adult dataset model metainformation\n");

    printf("tree count: %zu\n", GetTreeCount(modelHandle));

    // In our case we were solving a binary classification problem (whether a person makes over 50K a year), so the
    // dimension of the prediction will be 1; it will return the probability that the object belongs to the positive
    // class. In our case we had two classes encoded as "<=50K" and ">50K"; during data preprocessing (see
    // `get_fixed_adult()` in the Notebook) we encoded "<=50K" as 0 and ">50K" as 1, so that ">50K" became the positive
    // class. The probability of the negative class ("<=50K") can easily be deduced as (1-p), where p is the
    // probability of the positive class.
    //
    // In most cases the prediction dimension will be 1 (for regression and for ranking); it can be N for
    // multiclassification, where N is the number of classes.
    printf("prediction dimension: %zu\n", GetDimensionsCount(modelHandle));
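
    // A generic caller would size its result buffers from the model instead of hard-coding 1. A minimal sketch
    // (assuming the result buffer must hold docCount * dimension doubles, as described for CalcModelPrediction):
    //
    //     size_t dim = GetDimensionsCount(modelHandle);
    //     double* raw = malloc(sizeof(double) * docCount * dim);
    //     CalcModelPrediction(modelHandle, docCount, ..., raw, docCount * dim);
    //
    // Our model is binary, so the dimension is 1 and the fixed-size buffers below are enough.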

    printf("numeric feature count: %zu\n", GetFloatFeaturesCount(modelHandle));

    printf("categoric feature count: %zu\n", GetCatFeaturesCount(modelHandle));

    // Ok, now let's try to use our model for prediction. We'll look at the test part of the Adult dataset. You will
    // need to download it [1] from the UCI repository. Look for "adult.test"; "adult.names" will also be useful
    // because it contains a human-readable description of the dataset.
    //
    // The first line of the test part of the dataset is:
    //
    // "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K."
    //
    // Based on "adult.names" we can recover its vectors of numeric and categoric features (in our case all
    // "continuous" features are numeric and all other features are categoric):
    //
    // numericFeatures: {25, 226802, 7, 0, 0, 40}
    // categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"}
    //
    // And he doesn't make 50K per year. Also note that the order of numeric and categoric features in the source data
    // and in `numericFeatures` and `categoricFeatures` is kept the same. Otherwise we can't apply the model (well, we
    // can, but the result of the prediction will be garbage).
    //
    // Now let's run it! And let's call this person "person A", to make variable names unique.
    //
    // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/

    printf("\n");

    float pers_a_num_feat[6] = {25., 226802., 7., 0., 0., 40.};
    const char* pers_a_cat_feat[8] = {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"};

    double result_a[1];

    const float* a_num_feat_ptr = pers_a_num_feat;
    const char** a_cat_feat_ptr = pers_a_cat_feat;

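    // CalcModelPrediction takes, per document, a pointer to that document's feature row (i.e. arrays of row
    // pointers). For a single document we therefore pass the address of the row pointer, which acts as a
    // 1-element array of rows.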
    if (!CalcModelPrediction(
            modelHandle,
            1,
            &a_num_feat_ptr, 6,
            &a_cat_feat_ptr, 8,
            result_a, 1)
        ) {
        printf("CalcModelPrediction error message: %s\n", GetErrorString());
    }

    // Since we made a prediction for only one person and the prediction dimension is 1, the probability that person A
    // makes over 50K will have index 0 in `result_a`.
    //
    // CatBoost doesn't compute a "probability"; to turn the CatBoost prediction into a probability we need to apply
    // the sigmoid function.
    double pers_a_makes_over_50k_prob = sigmoid(result_a[0]);
    printf("Person A makes over 50K a year with probability %f\n", pers_a_makes_over_50k_prob);

    // When we were training CatBoost we used the default classification threshold for AUC, which is equal to 0.5.
    // This means that our formula is optimized for this threshold, though we could change the threshold to optimize
    // some other metric on a different dataset; we won't do that in this tutorial.
    double classification_threshold = 0.5;

    bool pers_a_makes_over_50k = pers_a_makes_over_50k_prob > classification_threshold;
    printf("Person A %s\n", answer(pers_a_makes_over_50k));

    // Now let's find an example with missing features and income greater than 50K a year. At line 40 of "adult.test"
    // we can find the following line:
    //
    // "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K."
    //
    // Let's call this person "Person B". The dataset is missing the "native-country" feature for Person B (missing
    // features are marked with "?"). When we were doing preprocessing in `get_fixed_adult` we replaced missing
    // categoric features with the string "nan"; now, when we apply the trained model, we must also use "nan" for
    // missing features. Let's write out the feature vectors for Person B:
    //
    // numericFeatures = {40, 85019, 16, 0, 0, 45};
    // categoricFeatures = {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"};
    //
    // And according to the dataset Person B makes more than 50K a year. Ok, let's try to apply the model to this
    // example.

    printf("\n");

    float pers_b_num_feat[6] = {40., 85019., 16., 0., 0., 45.};
    const char* pers_b_cat_feat[8] = {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"};

    double result_b[1];

    const float* b_num_feat_ptr = pers_b_num_feat;
    const char** b_cat_feat_ptr = pers_b_cat_feat;

    if (!CalcModelPrediction(
            modelHandle,
            1,
            &b_num_feat_ptr, 6,
            &b_cat_feat_ptr, 8,
            result_b, 1)
        ) {
        printf("CalcModelPrediction error message: %s\n", GetErrorString());
    }
    double pers_b_makes_over_50k_prob = sigmoid(result_b[0]);
    bool pers_b_makes_over_50k = pers_b_makes_over_50k_prob > classification_threshold;
    printf("Person B makes over 50K a year with probability %f\n", pers_b_makes_over_50k_prob);
    printf("Person B %s\n", answer(pers_b_makes_over_50k));

    // Let's try to apply the model to Person A and Person B in one call.
    printf("\n");

    const float* pers_ab_num_feat[2] = {pers_a_num_feat, pers_b_num_feat};
    const char** pers_ab_cat_feat[2] = {pers_a_cat_feat, pers_b_cat_feat};

    double result_ab[2];

    const float** ab_num_feat_ptr = pers_ab_num_feat;
    const char*** ab_cat_feat_ptr = pers_ab_cat_feat;

    if (!CalcModelPrediction(
            modelHandle,
            2,
            ab_num_feat_ptr, 6,
            ab_cat_feat_ptr, 8,
            result_ab, 2)
        ) {
        printf("CalcModelPrediction error message: %s\n", GetErrorString());
    }
    double pers_ab_makes_over_50k_prob[2] = {sigmoid(result_ab[0]), sigmoid(result_ab[1])};
    bool pers_ab_makes_over_50k[2] = {pers_ab_makes_over_50k_prob[0] > classification_threshold, pers_ab_makes_over_50k_prob[1] > classification_threshold};

    printf("Using batch interface\n");

    // Predictions should be the same as above
    printf("Person A makes over 50K a year with probability %f\n", pers_ab_makes_over_50k_prob[0]);
    printf("Person A %s\n", answer(pers_ab_makes_over_50k[0]));
    printf("Person B makes over 50K a year with probability %f\n", pers_ab_makes_over_50k_prob[1]);
    printf("Person B %s\n", answer(pers_ab_makes_over_50k[1]));

    ModelCalcerDelete(modelHandle);
    return 0;
}