Skip to content

Commit ecff26a

Browse files
committed
Implemented tutorial for Clang
1 parent 99400b8 commit ecff26a

File tree

7 files changed

+1129
-0
lines changed

7 files changed

+1129
-0
lines changed

apply_model/README.md

+6
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,9 @@
1414

1515
* [Apply CatBoost model from Rust](./rust/train_model.ipynb)
1616
* Explore how to apply CatBoost model from Rust application. If you just want to look at code snippets you can go directly to [main.rs](./rust/src/main.rs)
17+
18+
* [Apply CatBoost model from C](./clang/train_model.ipynb)
19+
* Explore how to apply CatBoost model from C application. If you just want to look at code snippets you can go directly to [main.c](./clang/src/main.c)
20+
21+
* [Apply CatBoost model from D](./dlang/train_model.ipynb)
22+
* Explore how to apply CatBoost model from D application. If you just want to look at code snippets you can go directly to [main.d](./dlang/src/main.d)

apply_model/clang/readme.md

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Apply CatBoost model from C
2+
This tutorial consists of two parts:
3+
- first part where we preprocess dataset and train the classifier model.
4+
This part can be found in [train_model.ipynb](train_model.ipynb).
5+
- second part where we load model into C application and then apply it.
6+
This part is presented as a C file. First you need to build the library, as suggested in the [CatBoost dynamic C wrapper documentation](https://catboost.ai/en/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper). To run, you can execute:
7+
* in case Linux/macOS
8+
9+
`clang <your sources and options> -L<path_to_dir_with_libcatboostmodel> -lcatboostmodel`
10+
* in case Windows
11+
12+
`cl.exe <your sources and options> /link <path_to_dir_with_libcatboostmodel>\catboostmodel.lib`
13+
14+
If you just want to look at code snippets you can go directly to [src/main.c](src/main.c).

apply_model/clang/src/main.c

+180
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#include <stdio.h>
#include <math.h>
#include <stdbool.h>

// Bring catboost module into the scope
#include <path_to_dir_with_header_file/c_api.h>
6+
7+
double sigmoid(double x) {
8+
return 1. / (1. + exp(-x));
9+
}
10+
11+
/* Map the binary classification outcome to a human-readable verdict.
 * Returns a pointer to a string literal; the caller must not modify it. */
char* answer(bool makes_over_50k_a_year) {
    return makes_over_50k_a_year
        ? "makes over 50k a year"
        : "doesn't make over 50k a year";
}
18+
19+
int main(int argc, const char * argv[]) {
20+
// Load "adult.cbm" model that we trained withing Jupyter Notebook
21+
ModelCalcerHandle* modelHandle;
22+
modelHandle = ModelCalcerCreate();
23+
if (!LoadFullModelFromFile(modelHandle, "adult.cbm")) {
24+
printf("LoadFullModelFromFile error message: %s\n", GetErrorString());
25+
}
26+
27+
// You can also try to load your own model just replace "adult.cbm" with path to your model that classifies data
28+
// from UCI Adult Dataset.
29+
30+
printf("Adult dataset model metainformation\n");
31+
32+
printf("tree count: %zu\n", GetTreeCount(modelHandle));
33+
34+
// In our case we were solving a binary classification problem (weather person makes over 50K a year), so the
35+
// dimension of the prediction will be 1, it will return probability of the object to belong to the positive
36+
// class; in our case we had two classed encoded as "<=50K" and ">50K", during data preprocessing (see
37+
// `get_fixed_adult()` in Notebook) we encoded "<=50K" as 0 and ">50K" as 1, so that ">50K" became a positive
38+
// class. Probability of the negative class ("<=50K") can be easily deduced as (1-p) where p is a probability of
39+
// positive class.
40+
//
41+
// For most of cases prediction dimension will be 1 (for regression and for ranking), it can be N for cases of
42+
// multiclassification, where N is a number of classes.
43+
printf("prediction dimension: %zu\n", GetDimensionsCount(modelHandle));
44+
45+
printf("numeric feature count: %zu\n", GetFloatFeaturesCount(modelHandle));
46+
47+
printf("categoric feature count: %zu\n", GetCatFeaturesCount(modelHandle));
48+
49+
// Ok now lets try to use our model for prediction. We'll look at the test part of Adult dataset. You will need
50+
// to download it [1] from UCI repository. Look for "adult.test", "adult.name" will also be useful because it
51+
// in contains human-readable description of the dataset.
52+
//
53+
// So the first line of test part of the dataset is:
54+
//
55+
// "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K."
56+
//
57+
// Based on "adult.name" we can recover its vectors of numeric and categoric features (in our case all
58+
// "continuous" features are numeric and all other features are categoric):
59+
//
60+
// numericFeatures: {25, 226802, 7, 0, 0, 40}
61+
// categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"}
62+
//
63+
// And he doesn't make 50K per year. Also note that order of numeric and categoric features in source data and
64+
// in `numericFeatures` and `categoricFeatures` is kept the same. Otherwise we can't apply the model (well, we
65+
// can, but result of prediction will be garbage).
66+
//
67+
// Now lets run it! And let's call this person "person A", to make variable names unique.
68+
//
69+
// [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/
70+
71+
printf("\n");
72+
73+
float pers_a_num_feat[6] = {25., 226802., 7., 0., 0., 40.};
74+
char* pers_a_cat_feat[8] = {"Private","11th","Never-married","Machine-op-inspct","Own-child","Black","Male","United-States"};
75+
76+
double result_a[1];
77+
78+
const float* a_num_feat_ptr = pers_a_num_feat;
79+
const char** a_cat_feat_ptr = pers_a_cat_feat;
80+
81+
if (!CalcModelPrediction(
82+
modelHandle,
83+
1,
84+
&a_num_feat_ptr, 6,
85+
&a_cat_feat_ptr, 8,
86+
&result_a, 1)
87+
) {
88+
printf("CalcModelPrediction error message: %s\n", GetErrorString());
89+
}
90+
91+
// Since we made prediction only for one person and prediction dimension is 1, proability of person A make
92+
// over 50K will have index 0 in `person_a_prediction`.
93+
//
94+
// CatBoost doesn't compute "probability", to turn CatBoost prediction into a probability we'll need to apply
95+
// sigmoid function.
96+
double pers_a_makes_over_50k_prob = sigmoid(result_a[0]);
97+
printf("Person A make over 50K a year with probability %f\n", pers_a_makes_over_50k_prob);
98+
99+
// When we were training CatBoost we used a default classification threshold for AUC which is equal to 0.5,
100+
// this means that our formula is optimized for this threashold, though we may change threshold to optimize some
101+
// other metric on a different dataset, but we won't do it in this tutorial.
102+
double classification_threshold = 0.5;
103+
104+
bool pers_a_makes_over_50k = pers_a_makes_over_50k_prob > classification_threshold;
105+
printf("Person A %s\n", answer(pers_a_makes_over_50k));
106+
107+
// Now lets find an example with missing features and income greater than 50K a year. At line 40 of "adult.test"
108+
// we can find following line:
109+
//
110+
// "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K."
111+
//
112+
// Lets call this person "Person B", dataset missing (missing features are marked with "?") "native-county"
113+
// feature for Person B. When we were doing preprocessing in `get_fixed_adult` we replaced missing categoric
114+
// features with string "nan", now, when we apply trained model we must also use "nan" for missing features.
115+
// Lets write out feature vectors for Person B:
116+
//
117+
// numericFeatures = {40, 85019, 16, 0, 0, 45};
118+
// categoricFeatures = {"Private", "Doctorate", "Married-civ-spouce", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"};
119+
//
120+
// And according to the dataset Person B makes more than 50K a year. Ok, lets try to apply the model to this
121+
// example.
122+
123+
printf("\n");
124+
125+
float pers_b_num_feat[w] = {40., 85019., 16., 0., 0., 45.};
126+
char* pers_b_cat_feat[8] = {"Private","Doctorate","Married-civ-spouce","Prof-specialty","Husband","Asian-Pac-Islander","Male","nan"};
127+
128+
double result_b[1];
129+
130+
const float* b_num_feat_ptr = pers_b_num_feat;
131+
const char** b_cat_feat_ptr = pers_b_cat_feat;
132+
133+
if (!CalcModelPrediction(
134+
modelHandle,
135+
1,
136+
&b_num_feat_ptr, 6,
137+
&b_cat_feat_ptr, 8,
138+
&result_b, 1)
139+
) {
140+
printf("CalcModelPrediction error message: %s\n", GetErrorString());
141+
}
142+
double pers_b_makes_over_50k_prob = sigmoid(result_b[0]);
143+
bool pers_b_makes_over_50k = pers_b_makes_over_50k_prob > classification_threshold;
144+
printf("Person B make over 50K a year with probability %f\n", pers_b_makes_over_50k_prob);
145+
printf("Person B %s\n", answer(pers_b_makes_over_50k));
146+
147+
// Let's try to apply the model to Person A and Person B in one call.
148+
printf("\n");
149+
150+
float* pers_ab_num_feat[2] = {pers_a_num_feat, pers_b_num_feat};
151+
char** pers_ab_cat_feat[2] = {pers_a_cat_feat, pers_b_cat_feat};
152+
153+
double result_ab[2];
154+
155+
const float** ab_num_feat_ptr = (const float**)pers_ab_num_feat;
156+
const char*** ab_cat_feat_ptr = (const char**)pers_ab_cat_feat;
157+
158+
if (!CalcModelPrediction(
159+
modelHandle,
160+
2,
161+
ab_num_feat_ptr, 6,
162+
ab_cat_feat_ptr, 8,
163+
&result_ab, 2)
164+
) {
165+
printf("CalcModelPrediction error message: %s\n", GetErrorString());
166+
}
167+
double pers_ab_makes_over_50k_prob[2] = {sigmoid(result_ab[0]), sigmoid(result_ab[1])};
168+
bool pers_ab_makes_over_50k[2] = {pers_ab_makes_over_50k_prob[0] > classification_threshold, pers_ab_makes_over_50k_prob[1] > classification_threshold};
169+
170+
printf("Using batch interface\n");
171+
172+
// Predictions should be same as above
173+
printf("Person A make over 50K a year with probability %f\n", pers_ab_makes_over_50k_prob[0]);
174+
printf("Person A %s\n", answer(pers_ab_makes_over_50k[0]));
175+
printf("Person B make over 50K a year with probability %f\n", pers_ab_makes_over_50k_prob[1]);
176+
printf("Person B %s\n", answer(pers_ab_makes_over_50k[1]));
177+
178+
ModelCalcerDelete(modelHandle);
179+
return 0;
180+
}

0 commit comments

Comments
 (0)