From ecff26aedf328622c93ffb231af95f9c84637338 Mon Sep 17 00:00:00 2001 From: Serg Gini <kornburn@gmail.com> Date: Sat, 29 Jul 2023 04:05:15 +0400 Subject: [PATCH 1/3] Implemented tutorial for Clang --- apply_model/README.md | 6 + apply_model/clang/readme.md | 14 ++ apply_model/clang/src/main.c | 180 ++++++++++++++ apply_model/clang/train_model.ipynb | 373 ++++++++++++++++++++++++++++ apply_model/dlang/readme.md | 7 + apply_model/dlang/src/main.d | 176 +++++++++++++ apply_model/dlang/train_model.ipynb | 373 ++++++++++++++++++++++++++++ 7 files changed, 1129 insertions(+) create mode 100644 apply_model/clang/readme.md create mode 100644 apply_model/clang/src/main.c create mode 100644 apply_model/clang/train_model.ipynb create mode 100644 apply_model/dlang/readme.md create mode 100644 apply_model/dlang/src/main.d create mode 100644 apply_model/dlang/train_model.ipynb diff --git a/apply_model/README.md b/apply_model/README.md index 00a098d..5ce90d5 100644 --- a/apply_model/README.md +++ b/apply_model/README.md @@ -14,3 +14,9 @@ * [Apply CatBoost model from Rust](./rust/train_model.ipynb) * Explore how to apply CatBoost model from Rust application. If you just want to look at code snippets you can go directly to [main.rs](./rust/src/main.rs) + +* [Apply CatBoost model from C](./clang/train_model.ipynb) + * Explore how to apply CatBoost model from C application. If you just want to look at code snippets you can go directly to [main.c](./clang/src/main.c) + +* [Apply CatBoost model from D](./dlang/train_model.ipynb) + * Explore how to apply CatBoost model from D application. 
If you just want to look at code snippets you can go directly to [main.d](./dlang/src/main.d) diff --git a/apply_model/clang/readme.md b/apply_model/clang/readme.md new file mode 100644 index 0000000..13b54a5 --- /dev/null +++ b/apply_model/clang/readme.md @@ -0,0 +1,14 @@ +# Apply CatBoost model from C +This tutorial consists of two parts: +- first part where we preprocess dataset and train the classifier model. + This part can be found in [train_model.ipynb](train_model.ipynb). +- second part where we load model into C application and then apply it. + This part is presented as a C file. First you need to build the library, as described in [the CatBoost documentation](https://catboost.ai/en/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper). To build the application, run: + * on Linux/macOS + + `clang <your sources and options> -L<path_to_dir_with_libcatboostmodel> -lcatboostmodel` + * on Windows + + `cl.exe <your sources and options> /link <path_to_dir_with_libcatboostmodel>\catboostmodel.lib` + + If you just want to look at code snippets you can go directly to [src/main.c](src/main.c). diff --git a/apply_model/clang/src/main.c b/apply_model/clang/src/main.c new file mode 100644 index 0000000..56c4cb2 --- /dev/null +++ b/apply_model/clang/src/main.c @@ -0,0 +1,180 @@ +#include <stdio.h> +#include <math.h> + +// Bring catboost module into the scope +#include <path_to_dir_with_header_file/c_api.h> + +double sigmoid(double x) { + return 1. / (1. 
+ exp(-x)); +} + +char* answer(bool makes_over_50k_a_year) { + if (makes_over_50k_a_year) { + return "makes over 50k a year"; + } else { + return "doesn't make over 50k a year"; + } +} + +int main(int argc, const char * argv[]) { + // Load "adult.cbm" model that we trained within Jupyter Notebook; nothing below can work without it, so bail out + ModelCalcerHandle* modelHandle = ModelCalcerCreate(); + if (!LoadFullModelFromFile(modelHandle, "adult.cbm")) { + printf("LoadFullModelFromFile error message: %s\n", GetErrorString()); + ModelCalcerDelete(modelHandle); + return 1; + } + + // You can also try to load your own model, just replace "adult.cbm" with path to your model that classifies data + // from UCI Adult Dataset. + + printf("Adult dataset model metainformation\n"); + printf("tree count: %zu\n", GetTreeCount(modelHandle)); + + // In our case we were solving a binary classification problem (whether person makes over 50K a year), so the + // dimension of the prediction will be 1, it will return probability of the object to belong to the positive + // class; in our case we had two classes encoded as "<=50K" and ">50K", during data preprocessing (see + // `get_fixed_adult()` in Notebook) we encoded "<=50K" as 0 and ">50K" as 1, so that ">50K" became a positive + // class. Probability of the negative class ("<=50K") can be easily deduced as (1-p) where p is a probability of + // positive class. + // + // For most of cases prediction dimension will be 1 (for regression and for ranking), it can be N for cases of + // multiclassification, where N is a number of classes. + printf("prediction dimension: %zu\n", GetDimensionsCount(modelHandle)); + + printf("numeric feature count: %zu\n", GetFloatFeaturesCount(modelHandle)); + + printf("categoric feature count: %zu\n", GetCatFeaturesCount(modelHandle)); + + // Ok now lets try to use our model for prediction. We'll look at the test part of Adult dataset. You will need + // to download it [1] from UCI repository. 
Look for "adult.test", "adult.names" will also be useful because it + // contains human-readable description of the dataset. + // + // So the first line of test part of the dataset is: + // + // "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K." + // + // Based on "adult.names" we can recover its vectors of numeric and categoric features (in our case all + // "continuous" features are numeric and all other features are categoric): + // + // numericFeatures: {25, 226802, 7, 0, 0, 40} + // categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"} + // + // And he doesn't make 50K per year. Also note that order of numeric and categoric features in source data and + // in `numericFeatures` and `categoricFeatures` is kept the same. Otherwise we can't apply the model (well, we + // can, but result of prediction will be garbage). + // + // Now lets run it! And let's call this person "person A", to make variable names unique. + // + // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ + + printf("\n"); + + float pers_a_num_feat[6] = {25., 226802., 7., 0., 0., 40.}; + const char* pers_a_cat_feat[8] = {"Private","11th","Never-married","Machine-op-inspct","Own-child","Black","Male","United-States"}; + + double result_a[1]; + + const float* a_num_feat_ptr = pers_a_num_feat; + const char** a_cat_feat_ptr = pers_a_cat_feat; + + if (!CalcModelPrediction( + modelHandle, + 1, + &a_num_feat_ptr, 6, + &a_cat_feat_ptr, 8, + result_a, 1) + ) { + printf("CalcModelPrediction error message: %s\n", GetErrorString()); + } + + // Since we made prediction only for one person and prediction dimension is 1, probability of person A make + // over 50K will have index 0 in `result_a`. + // + // CatBoost doesn't compute "probability", to turn CatBoost prediction into a probability we'll need to apply + // sigmoid function. 
+ double pers_a_makes_over_50k_prob = sigmoid(result_a[0]); + printf("Person A make over 50K a year with probability %f\n", pers_a_makes_over_50k_prob); + + // When we were training CatBoost we used a default classification threshold for AUC which is equal to 0.5, + // this means that our formula is optimized for this threshold, though we may change threshold to optimize some + // other metric on a different dataset, but we won't do it in this tutorial. + double classification_threshold = 0.5; + + bool pers_a_makes_over_50k = pers_a_makes_over_50k_prob > classification_threshold; + printf("Person A %s\n", answer(pers_a_makes_over_50k)); + + // Now lets find an example with missing features and income greater than 50K a year. At line 40 of "adult.test" + // we can find following line: + // + // "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K." + // + // Lets call this person "Person B", dataset missing (missing features are marked with "?") "native-country" + // feature for Person B. When we were doing preprocessing in `get_fixed_adult` we replaced missing categoric + // features with string "nan", now, when we apply trained model we must also use "nan" for missing features. + // Lets write out feature vectors for Person B: + // + // numericFeatures = {40, 85019, 16, 0, 0, 45}; + // categoricFeatures = {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"}; + // + // And according to the dataset Person B makes more than 50K a year. Ok, lets try to apply the model to this + // example. 
+ + printf("\n"); + + float pers_b_num_feat[6] = {40., 85019., 16., 0., 0., 45.}; + const char* pers_b_cat_feat[8] = {"Private","Doctorate","Married-civ-spouse","Prof-specialty","Husband","Asian-Pac-Islander","Male","nan"}; + + double result_b[1]; + + const float* b_num_feat_ptr = pers_b_num_feat; + const char** b_cat_feat_ptr = pers_b_cat_feat; + + if (!CalcModelPrediction( + modelHandle, + 1, + &b_num_feat_ptr, 6, + &b_cat_feat_ptr, 8, + result_b, 1) + ) { + printf("CalcModelPrediction error message: %s\n", GetErrorString()); + } + double pers_b_makes_over_50k_prob = sigmoid(result_b[0]); + bool pers_b_makes_over_50k = pers_b_makes_over_50k_prob > classification_threshold; + printf("Person B make over 50K a year with probability %f\n", pers_b_makes_over_50k_prob); + printf("Person B %s\n", answer(pers_b_makes_over_50k)); + + // Let's try to apply the model to Person A and Person B in one call. + printf("\n"); + + const float* pers_ab_num_feat[2] = {pers_a_num_feat, pers_b_num_feat}; + const char** pers_ab_cat_feat[2] = {pers_a_cat_feat, pers_b_cat_feat}; + + double result_ab[2]; + + const float** ab_num_feat_ptr = pers_ab_num_feat; + const char*** ab_cat_feat_ptr = pers_ab_cat_feat; + + if (!CalcModelPrediction( + modelHandle, + 2, + ab_num_feat_ptr, 6, + ab_cat_feat_ptr, 8, + result_ab, 2) + ) { + printf("CalcModelPrediction error message: %s\n", GetErrorString()); + } + double pers_ab_makes_over_50k_prob[2] = {sigmoid(result_ab[0]), sigmoid(result_ab[1])}; + bool pers_ab_makes_over_50k[2] = {pers_ab_makes_over_50k_prob[0] > classification_threshold, pers_ab_makes_over_50k_prob[1] > classification_threshold}; + + printf("Using batch interface\n"); + + // Predictions should be same as above + printf("Person A make over 50K a year with probability %f\n", pers_ab_makes_over_50k_prob[0]); + printf("Person A %s\n", answer(pers_ab_makes_over_50k[0])); + printf("Person B make over 50K a year with probability %f\n", pers_ab_makes_over_50k_prob[1]); + 
printf("Person B %s\n", answer(pers_ab_makes_over_50k[1])); + + ModelCalcerDelete(modelHandle); + return 0; +} \ No newline at end of file diff --git a/apply_model/clang/train_model.ipynb b/apply_model/clang/train_model.ipynb new file mode 100644 index 0000000..0d03029 --- /dev/null +++ b/apply_model/clang/train_model.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# catboost for clang tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q numpy pandas catboost" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import absolute_import, division, print_function, unicode_literals" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CatBoost version 0.14.2\n", + "NumPy version 1.16.3\n", + "Pandas version 0.24.2\n" + ] + } + ], + "source": [ + "import catboost as cb\n", + "import catboost.datasets as cbd\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# print module versions for reproducibility\n", + "print('CatBoost version {}'.format(cb.__version__))\n", + "print('NumPy version {}'.format(np.__version__))\n", + "print('Pandas version {}'.format(pd.__version__))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Download \"Adult Data Set\" [1] from UCI Machine Learning Repository.\n", + "\n", + " Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part\n", + " (adult.test) of the dataset.\n", + "\n", + " [1]: https://archive.ics.uci.edu/ml/datasets/Adult\n", + " \n" + ] + } + ], + "source": [ + "# We are going to use UCI Adult Data Set because it has both numerical and 
categorical \n", + "# features and also has missing features.\n", + "print(cbd.adult.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fixed_adult():\n", + " train, test = cbd.adult()\n", + " \n", + " # CatBoost doesn't support pandas.DataFrame missing values for categorical features out \n", + " # of the box (seed issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker). So \n", + " # we have to replace them with some designated string manually. \n", + " for dataset in (train, test, ):\n", + " for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n", + " dataset[name].fillna('nan', inplace=True)\n", + " \n", + " X_train, y_train = train.drop('income', axis=1), train.income\n", + " X_test, y_test = test.drop('income', axis=1), test.income\n", + " return X_train, y_train, X_test, y_test" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, y_train, _, _ = get_fixed_adult()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>workclass</th>\n", + " <th>fnlwgt</th>\n", + " <th>education</th>\n", + " <th>education-num</th>\n", + " <th>marital-status</th>\n", + " <th>occupation</th>\n", + " <th>relationship</th>\n", + " <th>race</th>\n", + " <th>sex</th>\n", + " <th>capital-gain</th>\n", + " <th>capital-loss</th>\n", + " <th>hours-per-week</th>\n", + " 
<th>native-country</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>39.0</td>\n", + " <td>State-gov</td>\n", + " <td>77516.0</td>\n", + " <td>Bachelors</td>\n", + " <td>13.0</td>\n", + " <td>Never-married</td>\n", + " <td>Adm-clerical</td>\n", + " <td>Not-in-family</td>\n", + " <td>White</td>\n", + " <td>Male</td>\n", + " <td>2174.0</td>\n", + " <td>0.0</td>\n", + " <td>40.0</td>\n", + " <td>United-States</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>50.0</td>\n", + " <td>Self-emp-not-inc</td>\n", + " <td>83311.0</td>\n", + " <td>Bachelors</td>\n", + " <td>13.0</td>\n", + " <td>Married-civ-spouse</td>\n", + " <td>Exec-managerial</td>\n", + " <td>Husband</td>\n", + " <td>White</td>\n", + " <td>Male</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>13.0</td>\n", + " <td>United-States</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38.0</td>\n", + " <td>Private</td>\n", + " <td>215646.0</td>\n", + " <td>HS-grad</td>\n", + " <td>9.0</td>\n", + " <td>Divorced</td>\n", + " <td>Handlers-cleaners</td>\n", + " <td>Not-in-family</td>\n", + " <td>White</td>\n", + " <td>Male</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>40.0</td>\n", + " <td>United-States</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>53.0</td>\n", + " <td>Private</td>\n", + " <td>234721.0</td>\n", + " <td>11th</td>\n", + " <td>7.0</td>\n", + " <td>Married-civ-spouse</td>\n", + " <td>Handlers-cleaners</td>\n", + " <td>Husband</td>\n", + " <td>Black</td>\n", + " <td>Male</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>40.0</td>\n", + " <td>United-States</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28.0</td>\n", + " <td>Private</td>\n", + " <td>338409.0</td>\n", + " <td>Bachelors</td>\n", + " <td>13.0</td>\n", + " <td>Married-civ-spouse</td>\n", + " <td>Prof-specialty</td>\n", + " <td>Wife</td>\n", + " <td>Black</td>\n", + " <td>Female</td>\n", + " <td>0.0</td>\n", + " 
<td>0.0</td>\n", + " <td>40.0</td>\n", + " <td>Cuba</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39.0 State-gov 77516.0 Bachelors 13.0 \n", + "1 50.0 Self-emp-not-inc 83311.0 Bachelors 13.0 \n", + "2 38.0 Private 215646.0 HS-grad 9.0 \n", + "3 53.0 Private 234721.0 11th 7.0 \n", + "4 28.0 Private 338409.0 Bachelors 13.0 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "0 2174.0 0.0 40.0 United-States \n", + "1 0.0 0.0 13.0 United-States \n", + "2 0.0 0.0 40.0 United-States \n", + "3 0.0 0.0 40.0 United-States \n", + "4 0.0 0.0 40.0 Cuba " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: Custom metrics will not be evaluated because there are no test datasets\n" + ] + }, + { + "data": { + "text/plain": [ + "<catboost.core.CatBoostClassifier at 0x7fe0d1ac77f0>" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If you want to find out how we found these parameters check \"Simple classification \n", + "# example with missing feature handling and parameter tuning\" tutorial in `classification`\n", + "# subdirectory of tutorials\n", + "model = cb.CatBoostClassifier(\n", + " class_names=('<=50K', '>50K'),\n", + " loss_function='Logloss',\n", + " 
eval_metric='AUC', \n", + " custom_metric=['AUC'],\n", + " iterations=100,\n", + " random_seed=20181224,\n", + " learning_rate=0.4234185321620083, \n", + " depth=5, \n", + " l2_leaf_reg=9.464266235679002)\n", + "model.fit(\n", + " cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n", + " verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model('adult.cbm')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "156K\tadult.cbm\r\n" + ] + } + ], + "source": [ + "!du -sh adult.cbm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We got the model, now it's time to use it via `catboost` package for C. Next part of the tutorial\n", + "will be in a C project." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/apply_model/dlang/readme.md b/apply_model/dlang/readme.md new file mode 100644 index 0000000..61cbd0c --- /dev/null +++ b/apply_model/dlang/readme.md @@ -0,0 +1,7 @@ +# Apply CatBoost model from Rust +This tutorial consists of two parts: +- first part where we preprocess dataset and train the classifier model. + This part can be found in [train_model.ipynb](train_model.ipynb). +- second part where we load model into Rust application and then apply it. + This part presented as a small Cargo project. To run, execute `cargo run --release`. + If you just want to look at code snippets you can go directly to [src/main.rs](src/main.rs). 
diff --git a/apply_model/dlang/src/main.d b/apply_model/dlang/src/main.d new file mode 100644 index 0000000..8a9242a --- /dev/null +++ b/apply_model/dlang/src/main.d @@ -0,0 +1,176 @@ +// Bring catboost module into the scope +use catboost; + +fn sigmoid(x: f64) -> f64 { + 1. / (1. + (-x).exp()) +} + +fn answer(makes_over_50k_a_year: bool) -> &'static str { + if makes_over_50k_a_year { + "makes over 50K a year" + } else { + "doesn't make over 50K a year" + } +} + +fn main() { + // Load "adult.cbm" model that we trained within Jupyter Notebook + let model_path = "adult.cbm"; + let model = catboost::Model::load(model_path).unwrap(); + + // You can also try to load your own model, just replace "adult.cbm" with path to your model that classifies data + // from UCI Adult Dataset. + + println!("Adult dataset model metainformation\n"); + + println!("tree count: {}", model.get_tree_count()); + + // In our case we were solving a binary classification problem (whether person makes over 50K a year), so the + // dimension of the prediction will be 1, it will return probability of the object to belong to the positive + // class; in our case we had two classes encoded as "<=50K" and ">50K", during data preprocessing (see + // `get_fixed_adult()` in Notebook) we encoded "<=50K" as 0 and ">50K" as 1, so that ">50K" became a positive + // class. Probability of the negative class ("<=50K") can be easily deduced as (1-p) where p is a probability of + // positive class. + // + // For most of cases prediction dimension will be 1 (for regression and for ranking), it can be N for cases of + // multiclassification, where N is a number of classes. + println!("prediction dimension: {}", model.get_dimensions_count()); + + println!("numeric feature count: {}", model.get_float_features_count()); + + println!("categoric feature count: {}", model.get_cat_features_count()); + + // Ok now lets try to use our model for prediction. We'll look at the test part of Adult dataset. 
You will need + // to download it [1] from UCI repository. Look for "adult.test", "adult.names" will also be useful because it + // contains human-readable description of the dataset. + // + // So the first line of test part of the dataset is: + // + // "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K." + // + // Based on "adult.names" we can recover its vectors of numeric and categoric features (in our case all + // "continuous" features are numeric and all other features are categoric): + // + // numericFeatures: {25, 226802, 7, 0, 0, 40} + // categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"} + // + // And he doesn't make 50K per year. Also note that order of numeric and categoric features in source data and + // in `numericFeatures` and `categoricFeatures` is kept the same. Otherwise we can't apply the model (well, we + // can, but result of prediction will be garbage). + // + // Now lets run it! And let's call this person "person A", to make variable names unique. + // + // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ + + println!(); + + let person_a_numeric_features = vec![25., 226_802., 7., 0., 0., 40.]; + let person_a_categoric_features = vec![ + String::from("Private"), + String::from("11th"), + String::from("Never-married"), + String::from("Machine-op-inspct"), + String::from("Own-child"), + String::from("Black"), + String::from("Male"), + String::from("United-States"), + ]; + let person_a_prediction = model + .calc_model_prediction( + vec![person_a_numeric_features.clone()], + vec![person_a_categoric_features.clone()], + ) + .unwrap(); + + // Since we made prediction only for one person and prediction dimension is 1, probability of person A make + // over 50K will have index 0 in `person_a_prediction`. 
+ // + // CatBoost doesn't compute "probability", to turn CatBoost prediction into a probability we'll need to apply + // sigmoid function. + let person_a_makes_over_50k_probability = sigmoid(person_a_prediction[0]); + println!( + "Person A make over 50K a year with probability {}", + person_a_makes_over_50k_probability + ); + + // When we were training CatBoost we used a default classification threshold for AUC which is equal to 0.5, + // this means that our formula is optimized for this threshold, though we may change threshold to optimize some + // other metric on a different dataset, but we won't do it in this tutorial. + let classification_threshold = 0.5; + + let person_a_makes_over_50k = person_a_makes_over_50k_probability > classification_threshold; + println!("Person A {}", answer(person_a_makes_over_50k)); + + // Now lets find an example with missing features and income greater than 50K a year. At line 40 of "adult.test" + // we can find following line: + // + // "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K." + // + // Lets call this person "Person B", dataset missing (missing features are marked with "?") "native-country" + // feature for Person B. When we were doing preprocessing in `get_fixed_adult` we replaced missing categoric + // features with string "nan", now, when we apply trained model we must also use "nan" for missing features. + // Lets write out feature vectors for Person B: + // + // numericFeatures = {40, 85019, 16, 0, 0, 45}; + // categoricFeatures = {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"}; + // + // And according to the dataset Person B makes more than 50K a year. Ok, lets try to apply the model to this + // example. 
+ + println!(); + + let person_b_numeric_features = vec![40., 85019., 16., 0., 0., 45.]; + let person_b_categoric_features = vec![ + String::from("Private"), + String::from("Doctorate"), + String::from("Married-civ-spouse"), + String::from("Prof-specialty"), + String::from("Husband"), + String::from("Asian-Pac-Islander"), + String::from("Male"), + String::from("nan"), + ]; + let person_b_prediction = model + .calc_model_prediction( + vec![person_b_numeric_features.clone()], + vec![person_b_categoric_features.clone()], + ) + .unwrap(); + let person_b_makes_over_50k_probability = sigmoid(person_b_prediction[0]); + let person_b_makes_over_50k = person_b_makes_over_50k_probability > classification_threshold; + println!( + "Person B make over 50K a year with probability {}", + person_b_makes_over_50k_probability + ); + println!("Person B {}", answer(person_b_makes_over_50k)); + + // Let's try to apply the model to Person A and Person B in one call. + + println!(); + + let persons_ab_numeric_features = vec![person_a_numeric_features, person_b_numeric_features]; + let persons_ab_categoric_features = vec![person_a_categoric_features, person_b_categoric_features]; + let persons_ab_predictions = model + .calc_model_prediction(persons_ab_numeric_features, persons_ab_categoric_features) + .unwrap(); + let persons_ab_make_over_50k_probabilities = + vec![sigmoid(persons_ab_predictions[0]), sigmoid(persons_ab_predictions[1])]; + let persons_ab_make_over_50k = vec![ + persons_ab_make_over_50k_probabilities[0] > classification_threshold, + persons_ab_make_over_50k_probabilities[1] > classification_threshold, + ]; + + println!("Using batch interface"); + + // Predictions should be same as above + println!( + "Person A make over 50K a year with probability {}", + persons_ab_make_over_50k_probabilities[0] + ); + println!("Person A {}", answer(persons_ab_make_over_50k[0])); + println!( + "Person B make over 50K a year with probability {}", + persons_ab_make_over_50k_probabilities[1] 
+ ); + println!("Person B {}", answer(persons_ab_make_over_50k[1])); +} diff --git a/apply_model/dlang/train_model.ipynb b/apply_model/dlang/train_model.ipynb new file mode 100644 index 0000000..5de4826 --- /dev/null +++ b/apply_model/dlang/train_model.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# catboost for rust tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q numpy pandas catboost" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import absolute_import, division, print_function, unicode_literals" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CatBoost version 0.14.2\n", + "NumPy version 1.16.3\n", + "Pandas version 0.24.2\n" + ] + } + ], + "source": [ + "import catboost as cb\n", + "import catboost.datasets as cbd\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# print module versions for reproducibility\n", + "print('CatBoost version {}'.format(cb.__version__))\n", + "print('NumPy version {}'.format(np.__version__))\n", + "print('Pandas version {}'.format(pd.__version__))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Download \"Adult Data Set\" [1] from UCI Machine Learning Repository.\n", + "\n", + " Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part\n", + " (adult.test) of the dataset.\n", + "\n", + " [1]: https://archive.ics.uci.edu/ml/datasets/Adult\n", + " \n" + ] + } + ], + "source": [ + "# We are going to use UCI Adult Data Set because it has both numerical and categorical \n", + "# features and also has missing features.\n", + 
"print(cbd.adult.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fixed_adult():\n", + " train, test = cbd.adult()\n", + " \n", + " # CatBoost doesn't support pandas.DataFrame missing values for categorical features out \n", + " # of the box (seed issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker). So \n", + " # we have to replace them with some designated string manually. \n", + " for dataset in (train, test, ):\n", + " for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n", + " dataset[name].fillna('nan', inplace=True)\n", + " \n", + " X_train, y_train = train.drop('income', axis=1), train.income\n", + " X_test, y_test = test.drop('income', axis=1), test.income\n", + " return X_train, y_train, X_test, y_test" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, y_train, _, _ = get_fixed_adult()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>workclass</th>\n", + " <th>fnlwgt</th>\n", + " <th>education</th>\n", + " <th>education-num</th>\n", + " <th>marital-status</th>\n", + " <th>occupation</th>\n", + " <th>relationship</th>\n", + " <th>race</th>\n", + " <th>sex</th>\n", + " <th>capital-gain</th>\n", + " <th>capital-loss</th>\n", + " <th>hours-per-week</th>\n", + " <th>native-country</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", 
+ " <th>0</th>\n", + " <td>39.0</td>\n", + " <td>State-gov</td>\n", + " <td>77516.0</td>\n", + " <td>Bachelors</td>\n", + " <td>13.0</td>\n", + " <td>Never-married</td>\n", + " <td>Adm-clerical</td>\n", + " <td>Not-in-family</td>\n", + " <td>White</td>\n", + " <td>Male</td>\n", + " <td>2174.0</td>\n", + " <td>0.0</td>\n", + " <td>40.0</td>\n", + " <td>United-States</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>50.0</td>\n", + " <td>Self-emp-not-inc</td>\n", + " <td>83311.0</td>\n", + " <td>Bachelors</td>\n", + " <td>13.0</td>\n", + " <td>Married-civ-spouse</td>\n", + " <td>Exec-managerial</td>\n", + " <td>Husband</td>\n", + " <td>White</td>\n", + " <td>Male</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>13.0</td>\n", + " <td>United-States</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38.0</td>\n", + " <td>Private</td>\n", + " <td>215646.0</td>\n", + " <td>HS-grad</td>\n", + " <td>9.0</td>\n", + " <td>Divorced</td>\n", + " <td>Handlers-cleaners</td>\n", + " <td>Not-in-family</td>\n", + " <td>White</td>\n", + " <td>Male</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>40.0</td>\n", + " <td>United-States</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>53.0</td>\n", + " <td>Private</td>\n", + " <td>234721.0</td>\n", + " <td>11th</td>\n", + " <td>7.0</td>\n", + " <td>Married-civ-spouse</td>\n", + " <td>Handlers-cleaners</td>\n", + " <td>Husband</td>\n", + " <td>Black</td>\n", + " <td>Male</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>40.0</td>\n", + " <td>United-States</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28.0</td>\n", + " <td>Private</td>\n", + " <td>338409.0</td>\n", + " <td>Bachelors</td>\n", + " <td>13.0</td>\n", + " <td>Married-civ-spouse</td>\n", + " <td>Prof-specialty</td>\n", + " <td>Wife</td>\n", + " <td>Black</td>\n", + " <td>Female</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>40.0</td>\n", + " <td>Cuba</td>\n", + " </tr>\n", + " 
</tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " age workclass fnlwgt education education-num \\\n", + "0 39.0 State-gov 77516.0 Bachelors 13.0 \n", + "1 50.0 Self-emp-not-inc 83311.0 Bachelors 13.0 \n", + "2 38.0 Private 215646.0 HS-grad 9.0 \n", + "3 53.0 Private 234721.0 11th 7.0 \n", + "4 28.0 Private 338409.0 Bachelors 13.0 \n", + "\n", + " marital-status occupation relationship race sex \\\n", + "0 Never-married Adm-clerical Not-in-family White Male \n", + "1 Married-civ-spouse Exec-managerial Husband White Male \n", + "2 Divorced Handlers-cleaners Not-in-family White Male \n", + "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", + "4 Married-civ-spouse Prof-specialty Wife Black Female \n", + "\n", + " capital-gain capital-loss hours-per-week native-country \n", + "0 2174.0 0.0 40.0 United-States \n", + "1 0.0 0.0 13.0 United-States \n", + "2 0.0 0.0 40.0 United-States \n", + "3 0.0 0.0 40.0 United-States \n", + "4 0.0 0.0 40.0 Cuba " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: Custom metrics will not be evaluated because there are no test datasets\n" + ] + }, + { + "data": { + "text/plain": [ + "<catboost.core.CatBoostClassifier at 0x7fe0d1ac77f0>" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If you want to find out how we found these parameters check \"Simple classification \n", + "# example with missing feature handling and parameter tuning\" tutorial in `classification`\n", + "# subdirectory of tutorials\n", + "model = cb.CatBoostClassifier(\n", + " class_names=('<=50K', '>50K'),\n", + " loss_function='Logloss',\n", + " eval_metric='AUC', \n", + " custom_metric=['AUC'],\n", + " iterations=100,\n", + " 
random_seed=20181224,\n", + " learning_rate=0.4234185321620083, \n", + " depth=5, \n", + " l2_leaf_reg=9.464266235679002)\n", + "model.fit(\n", + " cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n", + " verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model('adult.cbm')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "156K\tadult.cbm\r\n" + ] + } + ], + "source": [ + "!du -sh adult.cbm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We got the model, now it's time to use it via `catboost` package for Rust. Next part of the tutorial\n", + "will be in a Cargo project." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 85f96382e6d913ab7098f25fe7194930d751e817 Mon Sep 17 00:00:00 2001 From: Serg Gini <kornburn@gmail.com> Date: Sun, 30 Jul 2023 03:37:02 +0400 Subject: [PATCH 2/3] Added dlang tutorial D language tutorial used ImportC functionality to automatically create bindings from c_api.h header file --- apply_model/clang/readme.md | 2 +- apply_model/dlang/readme.md | 25 +++- apply_model/dlang/src/lib_import.c | 1 + apply_model/dlang/src/main.d | 215 +++++++++++++++++----------- apply_model/dlang/train_model.ipynb | 6 +- 5 files changed, 154 insertions(+), 95 deletions(-) create mode 100644 apply_model/dlang/src/lib_import.c diff --git a/apply_model/clang/readme.md b/apply_model/clang/readme.md index 13b54a5..b6a819f 100644 --- 
a/apply_model/clang/readme.md +++ b/apply_model/clang/readme.md @@ -3,7 +3,7 @@ This tutorial consists of two parts: - first part where we preprocess dataset and train the classifier model. This part can be found in [train_model.ipynb](train_model.ipynb). - second part where we load model into C application and then apply it. - This part presented as a C file. At first you need to build a library, as it is suggested on [](https://catboost.ai/en/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper). To run, you can execute: + This part is presented as a C file. First you need to build the library, as described in [Evaluation library](https://catboost.ai/en/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper). To run, you can execute: * in case Linux/macOS `clang <your sources and options> -L<path_to_dir_with_libcatboostmodel> -lcatboostmodel` diff --git a/apply_model/dlang/readme.md b/apply_model/dlang/readme.md index 61cbd0c..f48f526 100644 --- a/apply_model/dlang/readme.md +++ b/apply_model/dlang/readme.md @@ -1,7 +1,24 @@ -# Apply CatBoost model from Rust +# Apply CatBoost model from D This tutorial consists of two parts: - first part where we preprocess dataset and train the classifier model. This part can be found in [train_model.ipynb](train_model.ipynb). -- second part where we load model into Rust application and then apply it. - This part presented as a small Cargo project. To run, execute `cargo run --release`. - If you just want to look at code snippets you can go directly to [src/main.rs](src/main.rs). +- second part where we load model into D application and then apply it. + This part is presented as a D file. First you need to build the library, as described in [Evaluation library](https://catboost.ai/en/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper). + + After that you need to generate a preprocessed header file (.i) for the ImportC functionality. 
To prepare the preprocessed header you need to create a file [lib_import.c](src/lib_import.c) consisting of one line + + `#include <path_to_header_file/c_api.h>` + + and use an available C compiler (or the built-in preprocessor in the D compiler) as suggested in the [ImportC Documentation](https://dlang.org/spec/importc.html): + + `clang -E lib_import.c -o lib_import.i` + + To run, you can execute: + * in case Linux/macOS + + `ldc2 <your sources and options> <your_preprocessed_header_file> -L<path_to_dir_with_libcatboostmodel>/libcatboostmodel.{so/dylib}` + * in case Windows + + `ldc2.exe <your sources and options> <your_preprocessed_header_file> /link <path_to_dir_with_libcatboostmodel>\catboostmodel.{lib/dll}` + + If you just want to look at code snippets you can go directly to [src/main.d](src/main.d). diff --git a/apply_model/dlang/src/lib_import.c b/apply_model/dlang/src/lib_import.c new file mode 100644 index 0000000..c7ef90a --- /dev/null +++ b/apply_model/dlang/src/lib_import.c @@ -0,0 +1 @@ +#include </path_to_header_file/c_api.h> diff --git a/apply_model/dlang/src/main.d b/apply_model/dlang/src/main.d index 8a9242a..9463e94 100644 --- a/apply_model/dlang/src/main.d +++ b/apply_model/dlang/src/main.d @@ -1,29 +1,40 @@ +import std.stdio; +import std.math : exp; + // Bring catboost module into the scope -use catboost; +import lib_import; -fn sigmoid(x: f64) -> f64 { - 1. / (1. + (-x).exp()) +double sigmoid(double x) +{ + return 1. / (1. 
+ exp(-x)); } -fn answer(makes_over_50k_a_year: bool) -> &'static str { - if makes_over_50k_a_year { - "makes over 50K a year" - } else { - "doesn't make over 50K a year" +string answer(bool makes_over_50k_a_year) +{ + if (makes_over_50k_a_year) + { + return "makes over 50k a year"; + } + else + { + return "doesn't make over 50k a year"; } } -fn main() { +void main(string[] args) +{ // Load "adult.cbm" model that we trained withing Jupyter Notebook - let model_path = "adult.cbm"; - let model = catboost::Model::load(model_path).unwrap(); - + ModelCalcerHandle* modelHandle = ModelCalcerCreate(); + if (!(modelHandle.LoadFullModelFromFile("adult.cbm"))) + { + writeln("LoadFullModelFromFile error message: %s", GetErrorString()); + } // You can also try to load your own model just replace "adult.cbm" with path to your model that classifies data // from UCI Adult Dataset. - println!("Adult dataset model metainformation\n"); + writeln("Adult dataset model metainformation\n"); - println!("tree count: {}", model.get_tree_count()); + writeln("tree count: ", modelHandle.GetTreeCount()); // In our case we were solving a binary classification problem (weather person makes over 50K a year), so the // dimension of the prediction will be 1, it will return probability of the object to belong to the positive @@ -34,11 +45,11 @@ fn main() { // // For most of cases prediction dimension will be 1 (for regression and for ranking), it can be N for cases of // multiclassification, where N is a number of classes. 
- println!("prediction dimension: {}", model.get_dimensions_count()); + writeln("prediction dimension: ", modelHandle.GetDimensionsCount()); - println!("numeric feature count: {}", model.get_float_features_count()); + writeln("numeric feature count: ", modelHandle.GetFloatFeaturesCount()); - println!("categoric feature count: {}", model.get_cat_features_count()); + writeln("categoric feature count: ", modelHandle.GetCatFeaturesCount()); // Ok now lets try to use our model for prediction. We'll look at the test part of Adult dataset. You will need // to download it [1] from UCI repository. Look for "adult.test", "adult.name" will also be useful because it @@ -62,44 +73,52 @@ fn main() { // // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ - println!(); - - let person_a_numeric_features = vec![25., 226_802., 7., 0., 0., 40.]; - let person_a_categoric_features = vec![ - String::from("Private"), - String::from("11th"), - String::from("Never-married"), - String::from("Machine-op-inspct"), - String::from("Own-child"), - String::from("Black"), - String::from("Male"), - String::from("United-States"), + writeln(); + + const(float)[6] pers_a_num_feat = [25., 226_802., 7., 0., 0., 40.]; + const(char)*[8] pers_a_cat_feat = [ + "Private".ptr, + "11th".ptr, + "Never-married".ptr, + "Machine-op-inspct".ptr, + "Own-child".ptr, + "Black".ptr, + "Male".ptr, + "United-States".ptr ]; - let person_a_prediction = model - .calc_model_prediction( - vec![person_a_numeric_features.clone()], - vec![person_a_categoric_features.clone()], - ) - .unwrap(); + + double[1] result_a; + + auto a_num_feat_ptr = pers_a_num_feat.ptr; + auto a_cat_feat_ptr = pers_a_cat_feat.ptr; + + if (!modelHandle.CalcModelPrediction( + 1, + &a_num_feat_ptr, 6, + &a_cat_feat_ptr, 8, + result_a.ptr, 1)) + { + writeln("CalcModelPrediction error message: ", GetErrorString()); + } // Since we made prediction only for one person and prediction dimension is 1, proability of person A make // over 50K 
will have index 0 in `person_a_prediction`. // // CatBoost doesn't compute "probability", to turn CatBoost prediction into a probability we'll need to apply // sigmoid function. - let person_a_makes_over_50k_probability = sigmoid(person_a_prediction[0]); - println!( - "Person A make over 50K a year with probability {}", - person_a_makes_over_50k_probability + double pers_a_makes_over_50k_prob = sigmoid(result_a[0]); + writeln( + "Person A make over 50K a year with probability ", + pers_a_makes_over_50k_prob ); // When we were training CatBoost we used a default classification threshold for AUC which is equal to 0.5, // this means that our formula is optimized for this threashold, though we may change threshold to optimize some // other metric on a different dataset, but we won't do it in this tutorial. - let classification_threshold = 0.5; + double classification_threshold = 0.5; - let person_a_makes_over_50k = person_a_makes_over_50k_probability > classification_threshold; - println!("Person A {}", answer(person_a_makes_over_50k)); + bool pers_a_makes_over_50k = pers_a_makes_over_50k_prob > classification_threshold; + writeln("Person A ", answer(pers_a_makes_over_50k)); // Now lets find an example with missing features and income greater than 50K a year. At line 40 of "adult.test" // we can find following line: @@ -117,60 +136,82 @@ fn main() { // And according to the dataset Person B makes more than 50K a year. Ok, lets try to apply the model to this // example. 
- println!(); - - let person_b_numeric_features = vec![40., 85019., 16., 0., 0., 45.]; - let person_b_categoric_features = vec![ - String::from("Private"), - String::from("Doctorate"), - String::from("Married-civ-spouce"), - String::from("Prof-specialty"), - String::from("Husband"), - String::from("Asian-Pac-Islander"), - String::from("Male"), - String::from("nan"), + writeln(); + + const(float)[6] pers_b_num_feat = [40., 85_019., 16., 0., 0., 45.]; + const(char)*[8] pers_b_cat_feat = [ + "Private".ptr, + "Doctorate".ptr, + "Married-civ-spouce".ptr, + "Prof-specialty".ptr, + "Husband".ptr, + "Asian-Pac-Islander".ptr, + "Male".ptr, + "nan".ptr ]; - let person_b_prediction = model - .calc_model_prediction( - vec![person_b_numeric_features.clone()], - vec![person_b_categoric_features.clone()], - ) - .unwrap(); - let person_b_makes_over_50k_probability = sigmoid(person_b_prediction[0]); - let person_b_makes_over_50k = person_b_makes_over_50k_probability > classification_threshold; - println!( - "Person B make over 50K a year with probability {}", - person_b_makes_over_50k_probability + + double[1] result_b; + + auto b_num_feat_ptr = pers_b_num_feat.ptr; + auto b_cat_feat_ptr = pers_b_cat_feat.ptr; + + if (!modelHandle.CalcModelPrediction( + 1, + &b_num_feat_ptr, 6, + &b_cat_feat_ptr, 8, + result_b.ptr, 1)) + { + writeln("CalcModelPrediction error message: ", GetErrorString()); + } + + double pers_b_makes_over_50k_prob = sigmoid(result_b[0]); + bool pers_b_makes_over_50k = pers_b_makes_over_50k_prob > classification_threshold; + writeln( + "Person B make over 50K a year with probability ", + pers_b_makes_over_50k_prob ); - println!("Person B {}", answer(person_b_makes_over_50k)); + writeln("Person B ", answer(pers_b_makes_over_50k)); // Let's try to apply the model to Person A and Person B in one call. 
- println!(); - - let persons_ab_numberic_features = vec![person_a_numeric_features, person_b_numeric_features]; - let persons_ab_categoric_features = vec![person_a_categoric_features, person_b_categoric_features]; - let persons_ab_predictions = model - .calc_model_prediction(persons_ab_numberic_features, persons_ab_categoric_features) - .unwrap(); - let persons_ab_make_over_50k_probabilities = - vec![sigmoid(persons_ab_predictions[0]), sigmoid(persons_ab_predictions[1])]; - let persons_ab_make_over_50k = vec![ - persons_ab_make_over_50k_probabilities[0] > classification_threshold, - persons_ab_make_over_50k_probabilities[1] > classification_threshold, + writeln(); + + const(float)*[2] pers_ab_num_feat = cast(const(float)*[2])[pers_a_num_feat, pers_b_num_feat]; + const(char)**[2] pers_ab_cat_feat = cast(const(char)**[2])[pers_a_cat_feat, pers_b_cat_feat]; + + double[2] result_ab; + + auto ab_num_feat_ptr = cast(const(float)**)pers_ab_num_feat; + auto ab_cat_feat_ptr = cast(const(char)***)pers_ab_cat_feat; + + if (!modelHandle.CalcModelPrediction( + 2, + ab_num_feat_ptr, 6, + ab_cat_feat_ptr, 8, + result_ab.ptr, 2)) + { + writeln("CalcModelPrediction error message: ", GetErrorString()); + } + + double[2] pers_ab_makes_over_50k_prob = [sigmoid(result_ab[0]), sigmoid(result_ab[1])]; + bool[2] pers_ab_makes_over_50k = [ + pers_ab_makes_over_50k_prob[0] > classification_threshold, + pers_ab_makes_over_50k_prob[1] > classification_threshold ]; - println!("Using batch interface"); + writeln("Using batch interface"); // Predictions should be same as above - println!( - "Person A make over 50K a year with probability {}", - persons_ab_make_over_50k_probabilities[0] + writeln( + "Person A make over 50K a year with probability ", + pers_ab_makes_over_50k_prob[0] ); - println!("Person A {}", answer(persons_ab_make_over_50k[0])); - println!( - "Person B make over 50K a year with probability {}", - persons_ab_make_over_50k_probabilities[1] + writeln("Person A ", 
answer(pers_ab_makes_over_50k[0])); + writeln( + "Person B make over 50K a year with probability ", + pers_ab_makes_over_50k_prob[1] ); - println!("Person B {}", answer(persons_ab_make_over_50k[1])); + writeln("Person B ", answer(pers_ab_makes_over_50k[1])); + + modelHandle.ModelCalcerDelete(); } diff --git a/apply_model/dlang/train_model.ipynb b/apply_model/dlang/train_model.ipynb index 5de4826..10f9ca1 100644 --- a/apply_model/dlang/train_model.ipynb +++ b/apply_model/dlang/train_model.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# catboost for rust tutorial" + "# catboost for dlang tutorial" ] }, { @@ -344,8 +344,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We got the model, now it's time to use it via `catboost` package for Rust. Next part of the tutorial\n", - "will be in a Cargo project." + "We got the model, now it's time to use it via `catboost` package for D. Next part of the tutorial\n", + "will be in a D project." ] } ], From 3c8a3ee90a4bf6f5024aea2b50c079b2ea31ddcb Mon Sep 17 00:00:00 2001 From: Serg Gini <kornburn@gmail.com> Date: Sun, 30 Jul 2023 21:53:11 +0400 Subject: [PATCH 3/3] Update main.d Fixed strings literals, because in D they are already 0-terminated --- apply_model/dlang/src/main.d | 38 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/apply_model/dlang/src/main.d b/apply_model/dlang/src/main.d index 9463e94..39a1e81 100644 --- a/apply_model/dlang/src/main.d +++ b/apply_model/dlang/src/main.d @@ -77,17 +77,17 @@ void main(string[] args) const(float)[6] pers_a_num_feat = [25., 226_802., 7., 0., 0., 40.]; const(char)*[8] pers_a_cat_feat = [ - "Private".ptr, - "11th".ptr, - "Never-married".ptr, - "Machine-op-inspct".ptr, - "Own-child".ptr, - "Black".ptr, - "Male".ptr, - "United-States".ptr + "Private", + "11th", + "Never-married", + "Machine-op-inspct", + "Own-child", + "Black", + "Male", + "United-States" ]; - double[1] result_a; + double[1] 
result_a = [0]; auto a_num_feat_ptr = pers_a_num_feat.ptr; auto a_cat_feat_ptr = pers_a_cat_feat.ptr; @@ -140,17 +140,17 @@ void main(string[] args) const(float)[6] pers_b_num_feat = [40., 85_019., 16., 0., 0., 45.]; const(char)*[8] pers_b_cat_feat = [ - "Private".ptr, - "Doctorate".ptr, - "Married-civ-spouce".ptr, - "Prof-specialty".ptr, - "Husband".ptr, - "Asian-Pac-Islander".ptr, - "Male".ptr, - "nan".ptr + "Private", + "Doctorate", + "Married-civ-spouce", + "Prof-specialty", + "Husband", + "Asian-Pac-Islander", + "Male", + "nan" ]; - double[1] result_b; + double[1] result_b = [0]; auto b_num_feat_ptr = pers_b_num_feat.ptr; auto b_cat_feat_ptr = pers_b_cat_feat.ptr; @@ -179,7 +179,7 @@ void main(string[] args) const(float)*[2] pers_ab_num_feat = cast(const(float)*[2])[pers_a_num_feat, pers_b_num_feat]; const(char)**[2] pers_ab_cat_feat = cast(const(char)**[2])[pers_a_cat_feat, pers_b_cat_feat]; - double[2] result_ab; + double[2] result_ab = [0, 0]; auto ab_num_feat_ptr = cast(const(float)**)pers_ab_num_feat; auto ab_cat_feat_ptr = cast(const(char)***)pers_ab_cat_feat;