-
Notifications
You must be signed in to change notification settings - Fork 1
Prepare v11 Support #325
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Labels
Priority/0-High
To do now
Size/Days
Some days of work
Status/InDevelopment
The issue is in development by one or more team members
Milestone
Comments
Specification of the differences between the v10 and the new (v11) Core API, in a Git diff format:
def export_dictionary_as_json(
dictionary_file_path_or_domain,
json_dictionary_file_path,
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
):
"""Exports a Khiops dictionary file to JSON format (``.kdicj``)
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
... :
See :ref:`core-api-common-params`.
"""
def build_dictionary_from_data_table(
data_table_path,
output_dictionary_name,
output_dictionary_file_path,
detect_format=True,
header_line=None,
field_separator=None,
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Builds a dictionary file by analyzing a data table file
Parameters
----------
data_table_path : str
Path of the data table file.
output_dictionary_name : str
Name of the dictionary to be created.
output_dictionary_file_path : str
Path of the output dictionary file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
... :
See :ref:`core-api-common-params`.
"""
def check_database(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
max_messages=20,
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Checks if a data table is compatible with a dictionary file
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary of the table to be checked.
data_table_path : str
Path of the data table file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See the ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it checks ``sample_percentage`` percent of
the data; if equal to "Exclude sample" it checks the complement of the
data selected with "Include sample". See also :ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It checks only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
max_messages : int, default 20
Maximum number of error messages to write in the log file.
... :
See :ref:`core-api-common-params`.
"""
def train_predictor(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
target_variable,
- results_dir,
+ report_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=70.0,
sampling_mode="Include sample",
use_complement_as_test=True,
selection_variable="",
selection_value="",
additional_data_tables=None,
main_target_value="",
- snb_predictor=True,
- univariate_predictor_number=0,
+ keep_selected_variables_only=True,
max_evaluated_variables=0,
max_selected_variables=0,
- max_constructed_variables=100,
+ max_constructed_variables=1000,
construction_rules=None,
+ max_text_features=10000,
max_trees=10,
max_pairs=0,
all_possible_pairs=True,
specific_pairs=None,
- group_target_value=False,
+ text_features="words",
+ group_target_values=False,
discretization_method=None,
- min_interval_frequency=0,
- max_intervals=0,
grouping_method=None,
- min_group_frequency=0,
- max_groups=0,
- results_prefix="",
- batch_mode=True,
+ max_parts=0,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Trains a model from a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
target_variable : str
Name of the target variable. If the specified variable is categorical it
constructs a classifier and if it is numerical a regressor. If equal to "" it
performs an unsupervised analysis.
- results_dir : str
- Path of the results directory.
+ report_file_path : str
+ Path to the analysis report file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 70.0
See the ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it trains the predictor on ``sample_percentage``
percent of the data and tests the model on the remainder of the data if
``use_complement_as_test`` is set to ``True``. If equal to "Exclude sample" the
train and test datasets above are exchanged. See also
:ref:`core-api-sampling-mode`.
use_complement_as_test : bool, default ``True``
Uses the complement of the sampled database as test database for
computing the model's performance metrics.
- fill_test_database_settings : bool, default ``False``
- It creates a test database as the complement of the train database.
- **Deprecated** will be removed in Khiops 11, use ``use_complement_as_test``
selection_variable : str, default ""
It trains with only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
main_target_value : str, default ""
If this target value is specified then it guarantees the calculation of lift
curves for it.
- snb_predictor : bool, default ``True``
- If ``True`` it trains a Selective Naive Bayes predictor. **Deprecated** will be
- removed in Khiops 11.
- univariate_predictor_number : int, default 0
- Number of univariate predictors to train.**Deprecated** will be removed in
- Khiops 11.
- map_predictor : bool, default ``False``
- If ``True`` trains a Maximum a Posteriori Naive Bayes predictor.
- **Deprecated** will be removed in Khiops Python 11.
+ keep_selected_variables_only : bool, default ``True``
+ Keeps only predictor-selected variables in the supervised analysis report.
max_evaluated_variables : int, default 0
Maximum number of variables to be evaluated in the SNB predictor training. If
equal to 0 it evaluates all informative variables.
max_selected_variables : int, default 0
Maximum number of variables to be selected in the SNB predictor. If equal to
0 it selects all the variables kept in the training.
- max_constructed_variables : int, default 100
+ max_constructed_variables : int, default 1000
Maximum number of variables to construct.
construction_rules : list of str, optional
Allowed rules for the automatic variable construction. If not set it uses all
possible rules.
+ max_text_features : int, default 10000
+ Maximum number of text features to construct.
max_trees : int, default 10
Maximum number of trees to construct. Not yet available in regression.
max_pairs : int, default 0
- Maximum number of variables pairs to construct.
+ Maximum number of variable pairs to construct.
specific_pairs : list of tuple, optional
User-specified pairs as a list of 2-tuples of feature names. If a given tuple
contains only one non-empty feature name, then it generates all the pairs
containing it (within the maximum limit ``max_pairs``). These pairs have top
priority: they are constructed first.
+ text_features : str, default "words"
+ Type of the text features. Can be either one of:
+ - "words": sequences of non-space characters
+ - "ngrams": sequences of bytes
+ - "tokens": user-defined
all_possible_pairs : bool, default ``True``
If ``True`` tries to create all possible pairs within the limit ``max_pairs``.
Pairs specified with ``specific_pairs`` have top priority: they are constructed
first.
- only_pairs_with : str, default ""
- Constructs only pairs with the specified variable name. If equal to the empty
- string "" it considers all variables to make pairs.
- **Deprecated** will be removed in Khiops Python 11, use ``specific_pairs``.
- group_target_value : bool, default ``False``
+ group_target_values : bool, default ``False``
Allows grouping of the target variable values in classification. It can
substantially increase the training time.
discretization_method : str
- Name of the discretization method. Its valid values depend on the task:
- - Supervised: "MODL" (default), "EqualWidth" or "EqualFrequency"
- - Unsupervised: "EqualWidth" (default), "EqualFrequency" or "None"
- min_interval_frequency : int, default 0
- Minimum number of instances in an interval. If equal to 0 it is
- automatically calculated. **Deprecated** will be removed in Khiops 11.
- max_intervals : int, default 0
- Maximum number of intervals to construct. If equal to 0 it is automatically
- calculated. **Deprecated** will be replaced by ``max_parts`` in Khiops 11.
+ Name of the discretization method, for unsupervised analysis only.
+ Its valid values are: "EqualWidth" (default), "EqualFrequency" or "None".
+ Ignored for supervised analysis.
grouping_method : str
- Name of the grouping method. Its valid values depend on the task:
- - Supervised: "MODL" (default) or "BasicGrouping"
- - Unsupervised: "BasicGrouping" (default) or "None"
- min_group_frequency : int, default 0
- Minimum number of instances for a group. **Deprecated** will be removed in
- Khiops 11.
- max_groups : int, default 0
- Maximum number of groups. If equal to 0 it is automatically calculated.
- **Deprecated** will be replaced by ``max_parts`` in Khiops 11.
- results_prefix : str, default ""
- Prefix of the result files. **Deprecated** will be removed in Khiops 11.
+ Name of the grouping method, for unsupervised analysis only.
+ Its valid values are: "BasicGrouping" (default) or "None".
+ Ignored for supervised analysis.
+ max_parts : int, default 0
+ Maximum number of parts. If equal to 0 it is automatically calculated.
... :
See :ref:`core-api-common-params`.
Returns
-------
tuple
A 2-tuple containing:
- The reports file path
- The modeling dictionary file path in the supervised case.
Raises
------
`ValueError`
Invalid values of an argument
`TypeError`
Invalid type of an argument
"""
+
+
+def interpret_predictor(
+ dictionary_file_path_or_domain,
+ predictor_dictionary_name,
+ interpretor_file_path,
+ max_variable_importances=0,
+ reinforcement_target_value="",
+ reinforcement_lever_variables=None,
+ log_file_path=None,
+ output_scenario_path=None,
+ task_file_path=None,
+ trace=False,
+ stdout_file_path="",
+ stderr_file_path="",
+ max_cores=None,
+ memory_limit_mb=None,
+ temp_dir="",
+ scenario_prologue="",
+ **kwargs,
+):
+ r"""Builds an interpretation dictionary from a predictor
+
+ Parameters
+ ----------
+ dictionary_file_path_or_domain : str or `.DictionaryDomain`
+ Path of a Khiops dictionary file or a DictionaryDomain object.
+ predictor_dictionary_name : str
+ Name of the predictor dictionary used while building the interpretation model.
+ interpretor_file_path : str
+ Path to the interpretor dictionary file.
+ max_variable_importances : int, default 0
+ Maximum number of variable importances to be selected in the interpretation
+ model. If equal to 0, then all the variables in the prediction model are
+ considered.
+ reinforcement_target_value : str, default ""
+ If this target value is specified, then its probability of occurrence is
+ tentatively increased.
+ reinforcement_lever_variables : list of str, optional
+ The names of variables to use as lever variables while building the
+ interpretation model. Min length: 0. Max length: the total number of variables
+ in the prediction model. If not specified, all variables are used.
+ """
def evaluate_predictor(
dictionary_file_path_or_domain,
train_dictionary_name,
data_table_path,
- results_dir,
+ report_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
main_target_value="",
- results_prefix="",
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Evaluates the predictors in a dictionary file on a database
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
train_dictionary_name : str
Name of the main dictionary used while training the models.
data_table_path : str
Path of the evaluation data table file.
- results_dir : str
- Path of the results directory.
+ report_file_path : str
+ Path to the analysis report file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it evaluates the predictor on ``sample_percentage``
percent of the data. If equal to "Exclude sample" it evaluates the predictor on
the complement of the data selected with "Include sample". See also
:ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It evaluates only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
.. note:: Use the initial dictionary name in the data paths.
main_target_value : str, default ""
If this target value is specified then it guarantees the calculation of lift
curves for it.
- results_prefix : str, default ""
- Prefix of the result files. **Deprecated** will be removed in Khiops 11.
... :
See :ref:`core-api-common-params`.
Returns
-------
str
The path of the JSON evaluation report (extension ``.khj``).
Raises
------
`TypeError`
Invalid type of an argument.
"""
def train_recoder(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
target_variable,
- results_dir,
+ report_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
max_constructed_variables=100,
construction_rules=None,
max_trees=0,
max_pairs=0,
all_possible_pairs=True,
specific_pairs=None,
informative_variables_only=True,
max_variables=0,
keep_initial_categorical_variables=False,
keep_initial_numerical_variables=False,
categorical_recoding_method="part Id",
numerical_recoding_method="part Id",
pairs_recoding_method="part Id",
group_target_value=False,
discretization_method=None,
- min_interval_frequency=0,
- max_intervals=0,
grouping_method=None,
- min_group_frequency=0,
- max_groups=0,
- results_prefix="",
- batch_mode=True,
+ max_parts=0,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Trains a recoding model from a data table
A recoding model consists in the discretization of numerical variables and the
grouping of categorical variables.
If the ``target_variable`` is specified these partitions are constructed in
supervised mode, meaning that each resulting discretizations/groupings best
separates the target variable while maintaining a simple interval/group model of the
data. Different recoding methods can be specified via the
``numerical_recoding_method``, ``categorical_recoding_method`` and
``pairs_recoding_method`` options.
The output files of this process contain a dictionary file (``.kdic``) that can be
used to recode databases with the `deploy_model` function.
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be recoded.
data_table_path : str
Path of the data table file.
target_variable : str
Name of the target variable. If equal to "" it trains an unsupervised recoder.
- results_dir : str
- Path of the results directory.
+ report_file_path : str
+ Path of the analysis report.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it trains the recoder on ``sample_percentage``
percent of the data. If equal to "Exclude sample" it trains the recoder on the
complement of the data selected with "Include sample". See also
:ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It trains with only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
max_constructed_variables : int, default 100
Maximum number of variables to construct.
construction_rules : list of str, optional
Allowed rules for the automatic variable construction. If not set it uses all
possible rules.
max_trees : int, default 0
Maximum number of trees to construct. Not yet available in regression.
max_pairs : int, default 0
Maximum number of variable pairs to construct.
specific_pairs : list of tuple, optional
User-specified pairs as a list of 2-tuples of feature names. If a given tuple
contains only one non-empty feature name, then it generates all the pairs
containing it (within the maximum limit ``max_pairs``). These pairs have top
priority: they are constructed first.
all_possible_pairs : bool, default ``True``
If ``True`` tries to create all possible pairs within the limit ``max_pairs``.
Pairs specified with ``specific_pairs`` have top priority: they are constructed
first.
- only_pairs_with : str, default ""
- Constructs only pairs with the specified variable name. If equal to the empty
- string "" it considers all variables to make pairs.
- **Deprecated** will be removed in Khiops Python 11, use ``specific_pairs``.
group_target_value : bool, default ``False``
Allows grouping of the target variable values in classification. It can
substantially increase the training time.
- discretization_method : str
- Name of the discretization method. Its valid values depend on the task:
- - Supervised: "MODL" (default), "EqualWidth" or "EqualFrequency"
- - Unsupervised: "EqualWidth" (default), "EqualFrequency" or "None"
- min_interval_frequency : int, default 0
- Minimum number of instances in an interval. If equal to 0 it is
- automatically calculated. **Deprecated** will be removed in Khiops 11.
- max_intervals : int, default 0
- Maximum number of intervals to construct. If equal to 0 it is automatically
- calculated. **Deprecated** will be replaced by ``max_parts`` in Khiops 11.
informative_variables_only : bool, default ``True``
If ``True`` keeps only informative variables.
max_variables : int, default 0
Maximum number of variables to keep. If equal to 0 keeps all variables.
keep_initial_categorical_variables : bool, default ``False``
If ``True`` keeps the initial categorical variables.
keep_initial_numerical_variables : bool, default ``False``
If ``True`` keeps the initial numerical variables.
categorical_recoding_method : str
Type of recoding for categorical variables. Types available:
- "part Id" (default): An id for the interval/group
- "part label": A label for the interval/group
- "0-1 binarization": A 0's and 1's coding the interval/group id
- "conditional info": Conditional information of the interval/group
- "none": Keeps the variable as-is
numerical_recoding_method : str
Type of recoding for numerical variables. Types available:
- "part Id" (default): An id for the interval/group
- "part label": A label for the interval/group
- "0-1 binarization": A 0's and 1's coding the interval/group id
- "conditional info": Conditional information of the interval/group
- "center-reduction": "(X - Mean(X)) / StdDev(X)"
- "0-1 normalization": "(X - Min(X)) / (Max(X) - Min(X))"
- "rank normalization": mean normalized rank (between 0 and 1) of the
instances
- "none": Keeps the variable as-is
pairs_recoding_method : str
Type of recoding for bivariate variables. Types available:
- "part Id" (default): An id for the interval/group
- "part label": A label for the interval/group
- "0-1 binarization": A 0's and 1's coding the interval/group id
- "conditional info": Conditional information of the interval/group
- "none": Keeps the variable as-is
+ discretization_method : str
+ Name of the discretization method, for unsupervised analysis only.
+ Its valid values are: "EqualWidth" (default), "EqualFrequency" or "None".
+ Ignored for supervised analysis.
grouping_method : str
- Name of the grouping method. Its valid values depend on the task:
- - Supervised: "MODL" (default) or "BasicGrouping"
- - Unsupervised: "BasicGrouping" (default) or "None"
- min_group_frequency : int, default 0
- Minimum number of instances for a group. **Deprecated** will be removed in
- Khiops 11.
- max_groups : int, default 0
- Maximum number of groups. If equal to 0 it is automatically calculated.
- **Deprecated** will be replaced by ``max_parts`` in Khiops 11.
- results_prefix : str, default ""
- Prefix of the result files. **Deprecated** will be removed in Khiops 11.
+ Name of the grouping method, for unsupervised analysis only.
+ Its valid values are: "BasicGrouping" (default) or "None".
+ Ignored for supervised analysis.
+ max_parts : int, default 0
+ Maximum number of parts. If equal to 0 it is automatically calculated.
... :
See :ref:`core-api-common-params`.
Returns
-------
tuple
A 2-tuple containing:
- The path of the JSON file report of the process
- The path of the dictionary containing the recoding model
"""
def deploy_model(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
output_data_table_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
output_header_line=True,
output_field_separator="\t",
output_additional_data_tables=None,
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Deploys a model on a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object. This file/object
defines the model to be deployed. Note that this model is not necessarily a
predictor, it can be a generic table transformation.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
output_data_table_path : str
Path of the output data file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it deploys the model on ``sample_percentage``
percent of the data. If equal to "Exclude sample" it deploys the model on the
complement of the data selected with "Include sample". See also
:ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It deploys only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
output_header_line : bool, default ``True``
If ``True`` writes a header line with the column names in the output table.
output_field_separator : str, default "\\t"
The field separator character for the output table ("" counts as "\\t").
output_additional_data_tables : dict, optional
A dictionary containing the output data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
- results_prefix : str, default ""
- Prefix of the result files. **Deprecated** will be removed in Khiops 11.
... :
See :ref:`core-api-common-params`.
Raises
------
`TypeError`
Invalid type of an argument.
"""
def build_deployed_dictionary(
dictionary_file_path_or_domain,
dictionary_name,
output_dictionary_file_path,
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Builds a dictionary file to read the output table of a deployed model
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
output_dictionary_file_path : str
Path of the output dictionary file.
... :
See :ref:`core-api-common-params`.
Raises
------
`TypeError`
Invalid type of an argument
"""
def sort_data_table(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
output_data_table_path,
sort_variables=None,
detect_format=True,
header_line=None,
field_separator=None,
output_header_line=True,
output_field_separator="\t",
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Sorts a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
output_data_table_path : str
Path of the output data file.
sort_variables : list of str, optional
The names of the variables to sort. If not set sorts the table by its key.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
output_header_line : bool, default ``True``
If ``True`` writes a header line with the column names in the output table.
output_field_separator : str, default "\\t"
The field separator character for the output table ("" counts as "\\t").
... :
See :ref:`core-api-common-params`.
Raises
------
`TypeError`
Invalid type of an argument.
"""
def extract_keys_from_data_table(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
output_data_table_path,
detect_format=True,
header_line=None,
field_separator=None,
output_header_line=True,
output_field_separator="\t",
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Extracts from data table unique occurrences of a key variable
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary of the data table.
data_table_path : str
Path of the data table file.
output_data_table_path : str
Path of the output data file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
output_header_line : bool, default ``True``
If ``True`` writes a header line with the column names in the output table.
output_field_separator : str, default "\\t"
The field separator character for the output table ("" counts as "\\t").
... :
See :ref:`core-api-common-params`.
Raises
------
`TypeError`
Invalid type of an argument.
+def build_multi_table_dictionary(
+ dictionary_file_path_or_domain,
+ dictionary_name,
+ output_dictionary_name,
+ output_dictionary_table_variable_name,
+ log_file_path=None,
+ output_scenario_path=None,
+ task_file_path=None,
+ trace=False,
+ stdout_file_path="",
+ stderr_file_path="",
+ max_cores=None,
+ memory_limit_mb=None,
+ temp_dir="",
+ scenario_prologue="",
+ **kwargs,
+):
+ r"""Builds a dictionary with a Table variable based on an input dictionary and adds it
+ to the input dictionary
+
+ Parameters
+ ----------
+ dictionary_file_path_or_domain : str or `.DictionaryDomain`
+ Path of a Khiops dictionary file or a DictionaryDomain object.
+ dictionary_name : str
+ Name of the dictionary of the data table. Must be a root dictionary.
+ output_dictionary_name : str
+ Name of the output dictionary. It is a main dictionary which uses
+ `dictionary_name` as secondary table and the same key as `dictionary_name`.
+ The output dictionary is added to the input dictionary file or
+ `.DictionaryDomain` object.
+ ... :
+ See :ref:`core-api-common-params`.
+
+ Raises
+ ------
+ `TypeError`
+ Invalid type of an argument.
def train_coclustering(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
coclustering_variables,
- results_dir,
+ coclustering_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
frequency_variable="",
min_optimization_time=0,
- results_prefix="",
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Trains a coclustering model from a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
coclustering_variables : list of str
The names of variables to use in coclustering. Min length: 2. Max length: 10.
- results_dir : str
- Path of the results directory.
+ coclustering_file_path : str
+ Path to the coclustering report file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it trains the coclustering estimator on
``sample_percentage`` percent of the data. If equal to "Exclude sample" it
trains the coclustering estimator on the complement of the data selected with
"Include sample". See also :ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It trains with only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
frequency_variable : str, default ""
Name of frequency variable.
min_optimization_time : int, default 0
Minimum optimization time in seconds.
- results_prefix : str, default ""
- Prefix of the result files. **Deprecated** will be removed in Khiops 11.
... :
See :ref:`core-api-common-params`.
Returns
-------
str
The path of the resulting coclustering file.
Raises
------
`ValueError`
Number of coclustering variables out of the range 2-10.
`TypeError`
Invalid type of an argument.
"""
+
+
+def train_instance_variable_coclustering(
+ dictionary_file_path_or_domain,
+ dictionary_name,
+ data_table_path,
+ identifier_variable,
+ coclustering_file_path,
+ detect_format=True,
+ header_line=None,
+ field_separator=None,
+ sample_percentage=100.0,
+ sampling_mode="Include sample",
+ selection_variable="",
+ selection_value="",
+ additional_data_tables=None,
+ min_optimization_time=0,
+ log_file_path=None,
+ output_scenario_path=None,
+ task_file_path=None,
+ trace=False,
+ stdout_file_path="",
+ stderr_file_path="",
+ max_cores=None,
+ memory_limit_mb=None,
+ temp_dir="",
+ scenario_prologue="",
+ **kwargs,
+):
+ r"""Trains an instance-variable coclustering model from a data table
+
+ Parameters
+ ----------
+ dictionary_file_path_or_domain : str or `.DictionaryDomain`
+ Path of a Khiops dictionary file or a DictionaryDomain object.
+ dictionary_name : str
+ Name of the dictionary to be analyzed.
+ data_table_path : str
+ Path of the data table file.
+ identifier_variable : str
+ The name of the variable used to identify the instances in instance x variable
+ coclustering.
+ coclustering_file_path : str
+ Path to the coclustering report file.
+ detect_format : bool, default ``True``
+ If ``True`` detects automatically whether the data table file has a header and
+ its field separator. It is set to ``False`` if ``header_line`` or
+ ``field_separator`` are set.
+ header_line : bool, optional (default ``True``)
+ If ``True`` it uses the first line of the data as column names. Sets
+ ``detect_format`` to ``False`` if set. Ignored if ``detect_format``
+ is ``True``.
+ field_separator : str, optional (default "\\t")
+ A field separator character. "" has the same effect as "\\t". Sets
+ ``detect_format`` to ``False`` if set. Ignored if ``detect_format``
+ is ``True``.
+ sample_percentage : float, default 100.0
+ See ``sampling_mode`` option below.
+ sampling_mode : "Include sample" or "Exclude sample"
+ If equal to "Include sample" it trains the coclustering estimator on
+ ``sample_percentage`` percent of the data. If equal to "Exclude sample" it
+ trains the coclustering estimator on the complement of the data selected with
+ "Include sample". See also :ref:`core-api-sampling-mode`.
+ selection_variable : str, default ""
+ It trains with only the records such that the value of ``selection_variable`` is
+ equal to ``selection_value``. Ignored if equal to "".
+ selection_value: str or int or float, default ""
+ See ``selection_variable`` option above. Ignored if equal to "".
+ additional_data_tables : dict, optional
+ A dictionary containing the data paths and file paths for a multi-table
+ dictionary file. For more details see :doc:`/multi_table_primer`.
+ min_optimization_time : int, default 0
+ Minimum optimization time in seconds.
+ ... :
+ See :ref:`core-api-common-params`.
+
+ Returns
+ -------
+ str
+ The path of the resulting coclustering file.
+
+ Raises
+ ------
+ `ValueError`
+ Number of coclustering variables out of the range 2-10.
+ `TypeError`
+ Invalid type of an argument.
+ """
def simplify_coclustering(
coclustering_file_path,
simplified_coclustering_file_path,
- results_dir,
max_preserved_information=0,
max_cells=0,
max_total_parts=0,
max_part_numbers=None,
- results_prefix="",
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Simplifies a coclustering model
Parameters
----------
coclustering_file_path : str
Path of the coclustering file (extension ``.khc``, or ``.khcj``).
simplified_coclustering_file_path : str
Path of the output coclustering file.
- results_dir : str
- Path of the results directory.
max_preserved_information : int, default 0
Maximum information preserved in the simplified coclustering. If equal to 0
there is no limit.
max_cells : int, default 0
Maximum number of cells in the simplified coclustering. If equal to 0 there
is no limit.
max_total_parts : int, default 0
Maximum number of parts totaled over all variables. If equal to 0 there is no
limit.
max_part_numbers : dict, optional
Dictionary that associates variable names to their maximum number of parts to
preserve in the simplified coclustering. If not set there is no limit.
- results_prefix : str, default ""
- Prefix of the result files. **Deprecated** will be removed in Khiops 11.
... :
See :ref:`core-api-common-params`.
Raises
------
`TypeError`
Invalid type of an argument.
"""
def prepare_coclustering_deployment(
dictionary_file_path_or_domain,
dictionary_name,
coclustering_file_path,
table_variable,
deployed_variable_name,
- results_dir,
+ coclustering_dictionary_file,
max_preserved_information=0,
max_cells=0,
max_part_numbers=None,
build_cluster_variable=True,
build_distance_variables=False,
build_frequency_variables=False,
variables_prefix="",
- results_prefix="",
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Prepares an *individual-variable* coclustering deployment
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
coclustering_file_path : str
Path of the coclustering model file (extension ``.khc`` or ``.khcj``).
table_variable : str
Name of the table variable in the dictionary.
deployed_variable_name : str
Name of the coclustering variable to deploy.
- results_dir : str
- Path of the results directory.
+ coclustering_dictionary_file : str
+ Path of the coclustering dictionary file for deployment.
max_preserved_information : int, default 0
Maximum information preserved in the simplified coclustering. If equal to 0
there is no limit.
max_cells : int, default 0
Maximum number of cells in the simplified coclustering. If equal to 0 there
is no limit.
max_part_numbers : dict, optional
Dictionary associating variable names to their maximum number of parts to
preserve in the simplified coclustering. For variables not present in
``max_part_numbers`` there is no limit.
build_cluster_variable : bool, default ``True``
If ``True`` includes a cluster id variable in the deployment.
build_distance_variables : bool, default ``False``
If ``True`` includes a cluster distance variable in the deployment.
build_frequency_variables : bool, default ``False``
If ``True`` includes the frequency variables in the deployment.
variables_prefix : str, default ""
Prefix for the variables in the deployment dictionary.
- results_prefix : str, default ""
- Prefix of the result files. **Deprecated** will be removed in Khiops 11.
... :
See :ref:`core-api-common-params`.
Raises
------
`TypeError`
Invalid type of an argument
"""
def extract_clusters(
coclustering_file_path,
cluster_variable,
clusters_file_path,
max_preserved_information=0,
max_cells=0,
- batch_mode=True,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Extracts clusters to a tab separated (TSV) file
Parameters
----------
coclustering_file_path : str
Path of the coclustering model file (extension ``.khc`` or ``.khcj``).
cluster_variable : str
Name of the variable for which the clusters are extracted.
clusters_file_path : str
Path of the output clusters TSV file.
max_preserved_information : int, default 0
Maximum information preserved in the simplified coclustering. If equal to 0 there
is no limit.
max_cells : int, default 0
Maximum number of cells in the simplified coclustering. If equal to 0 there is
no limit.
... :
See :ref:`core-api-common-params`.
"""
def detect_data_table_format(
data_table_path,
dictionary_file_path_or_domain=None,
dictionary_name=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
):
"""Detects the format of a data table
Runs a heuristic to detect the format of a data table. The detection heuristic is
more accurate if a dictionary with the table schema is provided.
Parameters
----------
data_table_path : str
Path of the data table file.
dictionary_file_path_or_domain : str or `.DictionaryDomain`, optional
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str, optional
Name of the dictionary.
... :
See :ref:`core-api-common-params`.
Returns
-------
tuple
A 2-tuple containing:
- the ``header_line`` boolean
- the ``field_separator`` character
These are exactly the parameters expected in many Khiops Python API functions.
"""
-########################
-# Deprecated functions #
-########################
-
-
-def build_multi_table_dictionary(
- dictionary_file_path_or_domain,
- root_dictionary_name,
- secondary_table_variable_name,
- output_dictionary_file_path,
- overwrite_dictionary_file=False,
- batch_mode=True,
- log_file_path=None,
- output_scenario_path=None,
- task_file_path=None,
- trace=False,
-):
- """Builds a multi-table dictionary from a dictionary with a key
-
- .. warning::
- This method is *deprecated* since Khiops 10.1.3 and will be removed in Khiops
- 11. Use the `.build_multi_table_dictionary_domain` helper function to
- the same effect.
-
- Parameters
- ----------
- dictionary_file_path_or_domain : str or `.DictionaryDomain`
- Path of a Khiops dictionary file or a `.DictionaryDomain` object.
- root_dictionary_name : str
- Name for the new root dictionary
- secondary_table_variable_name : str
- Name, in the root dictionary, for the "table" variable of the secondary table.
- output_dictionary_file_path : str
- Path of the output dictionary path.
- overwrite_dictionary_file : bool, default ``False``
- If ``True`` it will overwrite an input dictionary file.
- ... :
- See :ref:`core-api-common-params`.
-
- Raises
- ------
- `ValueError`
- Invalid values of an argument
- """
|
Provide deprecation path for
N.B. |
Updated Core API for v11:
def get_khiops_version():
"""Returns the Khiops version
Returns
-------
str
The Khiops version of the current `.KhiopsRunner` backend.
"""
def get_samples_dir():
"""Returns the Khiops' *samples* directory path
Returns
-------
str
The path of the Khiops *samples* directory.
"""
def export_dictionary_as_json(
dictionary_file_path_or_domain,
json_dictionary_file_path,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
):
"""Exports a Khiops dictionary file to JSON format (``.kdicj``)
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the executed command line of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accented ones to be decoded with the UTF8->ANSI Khiops transformation.
"""
def build_dictionary_from_data_table(
data_table_path,
output_dictionary_name,
output_dictionary_file_path,
detect_format=True,
header_line=None,
field_separator=None,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Builds a dictionary file by analyzing a data table file
Parameters
----------
data_table_path : str
Path of the data table file.
output_dictionary_name : str
Name of the dictionary to be created.
output_dictionary_file_path : str
Path of the output dictionary file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the executed command line of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accented ones to be decoded with the UTF8->ANSI Khiops transformation.
"""
def check_database(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
max_messages=20,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Checks if a data table is compatible with a dictionary file
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary of the table to be checked.
data_table_path : str
Path of the data table file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See the ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it checks ``sample_percentage`` percent of
the data; if equal to "Exclude sample" it checks the complement of the
data selected with "Include sample". See also :ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It checks only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
max_messages : int, default 20
Maximum number of error messages to write in the log file.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the executed command line of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accented ones to be decoded with the UTF8->ANSI Khiops transformation.
"""
def train_predictor(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
target_variable,
analysis_report_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=70.0,
sampling_mode="Include sample",
use_complement_as_test=True,
selection_variable="",
selection_value="",
additional_data_tables=None,
do_data_preparation_only=False,
main_target_value="",
keep_selected_variables_only=True,
max_evaluated_variables=0,
max_selected_variables=0,
max_constructed_variables=1000,
construction_rules=None,
max_text_features=10000,
max_trees=10,
max_pairs=0,
all_possible_pairs=True,
specific_pairs=None,
text_features="words",
group_target_value=False,
discretization_method=None,
grouping_method=None,
max_parts=0,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Trains a model from a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
target_variable : str
Name of the target variable. If the specified variable is categorical it
constructs a classifier and if it is numerical a regressor. If equal to "" it
performs an unsupervised analysis.
analysis_report_file_path : str
Path to the analysis report file in the JSON format. An additional dictionary
file with the same name and extension `.model.kdic` is built, which contains
the trained models.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 70.0
See the ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it trains the predictor on ``sample_percentage``
percent of the data and tests the model on the remainder of the data if
``use_complement_as_test`` is set to ``True``. If equal to "Exclude sample" the
train and test datasets above are exchanged. See also
:ref:`core-api-sampling-mode`.
use_complement_as_test : bool, default ``True``
Uses the complement of the sampled database as test database for
computing the model's performance metrics.
selection_variable : str, default ""
It trains with only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
do_data_preparation_only : bool, default ``False``
If ``False``, it trains a Selective Naive Bayes predictor. Otherwise, only data
preparation via MODL discretization is done.
main_target_value : str, default ""
If this target value is specified then it guarantees the calculation of lift
curves for it.
keep_selected_variables_only : bool, default ``True``
Keeps only predictor-selected variables in the supervised analysis report.
max_evaluated_variables : int, default 0
Maximum number of variables to be evaluated in the SNB predictor training. If
equal to 0 it evaluates all informative variables.
max_selected_variables : int, default 0
Maximum number of variables to be selected in the SNB predictor. If equal to
0 it selects all the variables kept in the training.
max_constructed_variables : int, default 1000
Maximum number of variables to construct.
construction_rules : list of str, optional
Allowed rules for the automatic variable construction. If not set it uses all
possible rules.
max_text_features : int, default 10000
Maximum number of text features to construct.
max_trees : int, default 10
Maximum number of trees to construct.
max_pairs : int, default 0
Maximum number of variable pairs to construct.
specific_pairs : list of tuple, optional
User-specified pairs as a list of 2-tuples of feature names. If a given tuple
contains only one non-empty feature name, then it generates all the pairs
containing it (within the maximum limit ``max_pairs``). These pairs have top
priority: they are constructed first.
text_features : str, default "words"
Type of the text features. Can be either one of:
- "words": sequences of non-space characters
- "ngrams": sequences of bytes
- "tokens": user-defined
all_possible_pairs : bool, default ``True``
If ``True`` tries to create all possible pairs within the limit ``max_pairs``.
Pairs specified with ``specific_pairs`` have top priority: they are constructed
first.
group_target_value : bool, default ``False``
Allows grouping of the target variable values in classification. It can
substantially increase the training time.
discretization_method : str
Name of the discretization method, for unsupervised analysis only.
Its valid values are: "EqualWidth" (default), "EqualFrequency" or "None".
Ignored for supervised analysis.
grouping_method : str
Name of the grouping method, for unsupervised analysis only.
Its valid values are: "BasicGrouping" (default) or "None".
Ignored for supervised analysis.
max_parts : int, default 0
Maximum number of variable parts produced by preprocessing methods. If equal
to 0 it is automatically calculated.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the executed command line of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accented ones to be decoded with the UTF8->ANSI Khiops transformation.
Returns
-------
tuple
A 2-tuple containing:
- The reports file path
- The modeling dictionary file path in the supervised case.
Raises
------
`ValueError`
Invalid values of an argument
`TypeError`
Invalid type of an argument
"""
def interpret_predictor(
dictionary_file_path_or_domain,
predictor_dictionary_name,
interpretor_file_path,
max_variable_importances=0,
reinforcement_target_value="",
reinforcement_lever_variables=None,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Builds an interpretation dictionary from a predictor
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
predictor_dictionary_name : str
Name of the predictor dictionary used while building the interpretation model.
interpretor_file_path : str
Path to the interpretor dictionary file.
max_variable_importances : int, default 0
Maximum number of variable importances to be selected in the interpretation
model. If equal to 0, then all the variables in the prediction model are
considered.
reinforcement_target_value : str, default ""
If this target value is specified, then its probability of occurrence is
tentatively increased.
reinforcement_lever_variables : list of str, optional
The names of variables to use as lever variables while building the
interpretation model. Min length: 0. Max length: the total number of variables
in the prediction model. If not specified, all variables are used.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
"""
def evaluate_predictor(
dictionary_file_path_or_domain,
train_dictionary_name,
data_table_path,
evaluation_report_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
main_target_value="",
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Evaluates the predictors in a dictionary file on a database
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
train_dictionary_name : str
Name of the main dictionary used while training the models.
data_table_path : str
Path of the evaluation data table file.
evaluation_report_file_path : str
Path to the evaluation report file, in the JSON format.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it evaluates the predictor on ``sample_percentage``
percent of the data. If equal to "Exclude sample" it evaluates the predictor on
the complement of the data selected with "Include sample". See also
:ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It evaluates with only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
.. note:: Use the initial dictionary name in the data paths.
main_target_value : str, default ""
If this target value is specified then it guarantees the calculation of lift
curves for it.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Returns
-------
str
The path of the JSON evaluation report (extension ``.khj``).
Raises
------
`TypeError`
Invalid type of an argument.
"""
def train_recoder(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
target_variable,
analysis_report_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
max_constructed_variables=100,
construction_rules=None,
max_trees=0,
max_pairs=0,
all_possible_pairs=True,
specific_pairs=None,
informative_variables_only=True,
max_variables=0,
keep_initial_categorical_variables=False,
keep_initial_numerical_variables=False,
categorical_recoding_method="part Id",
numerical_recoding_method="part Id",
pairs_recoding_method="part Id",
group_target_value=False,
discretization_method=None,
grouping_method=None,
max_parts=0,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Trains a recoding model from a data table
A recoding model consists in the discretization of numerical variables and the
grouping of categorical variables.
If the ``target_variable`` is specified these partitions are constructed in
supervised mode, meaning that each resulting discretization/grouping best
separates the target variable while maintaining a simple interval/group model of the
data. Different recoding methods can be specified via the
``numerical_recoding_method``, ``categorical_recoding_method`` and
``pairs_recoding_method`` options.
The output files of this process contain a dictionary file (``.kdic``) that can be
used to recode databases with the `deploy_model` function.
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be recoded.
data_table_path : str
Path of the data table file.
target_variable : str
Name of the target variable. If equal to "" it trains an unsupervised recoder.
analysis_report_file_path : str
Path to the analysis report file in the JSON format. An additional dictionary
file with the same name and extension ``.model.kdic`` is built, which contains
the trained recoding model.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it trains the recoder on ``sample_percentage``
percent of the data. If equal to "Exclude sample" it trains the recoder on the
complement of the data selected with "Include sample". See also
:ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It trains with only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
max_constructed_variables : int, default 100
Maximum number of variables to construct.
construction_rules : list of str, optional
Allowed rules for the automatic variable construction. If not set it uses all
possible rules.
max_trees : int, default 0
Maximum number of trees to construct.
max_pairs : int, default 0
Maximum number of variable pairs to construct.
specific_pairs : list of tuple, optional
User-specified pairs as a list of 2-tuples of feature names. If a given tuple
contains only one non-empty feature name, then it generates all the pairs
containing it (within the maximum limit ``max_pairs``). These pairs have top
priority: they are constructed first.
all_possible_pairs : bool, default ``True``
If ``True`` tries to create all possible pairs within the limit ``max_pairs``.
Pairs specified with ``specific_pairs`` have top priority: they are constructed
first.
group_target_value : bool, default ``False``
Allows grouping of the target variable values in classification. It can
substantially increase the training time.
informative_variables_only : bool, default ``True``
If ``True`` keeps only informative variables.
max_variables : int, default 0
Maximum number of variables to keep. If equal to 0 keeps all variables.
keep_initial_categorical_variables : bool, default ``False``
If ``True`` keeps the initial categorical variables.
keep_initial_numerical_variables : bool, default ``False``
If ``True`` keeps the initial numerical variables.
categorical_recoding_method : str
Type of recoding for categorical variables. Types available:
- "part Id" (default): An id for the interval/group
- "part label": A label for the interval/group
- "0-1 binarization": A 0's and 1's coding the interval/group id
- "conditional info": Conditional information of the interval/group
- "none": Keeps the variable as-is
numerical_recoding_method : str
Type of recoding for numerical variables. Types available:
- "part Id" (default): An id for the interval/group
- "part label": A label for the interval/group
- "0-1 binarization": A 0's and 1's coding the interval/group id
- "conditional info": Conditional information of the interval/group
- "center-reduction": "(X - Mean(X)) / StdDev(X)"
- "0-1 normalization": "(X - Min(X)) / (Max(X) - Min(X))"
- "rank normalization": mean normalized rank (between 0 and 1) of the
instances
- "none": Keeps the variable as-is
pairs_recoding_method : str
Type of recoding for bivariate variables. Types available:
- "part Id" (default): An id for the interval/group
- "part label": A label for the interval/group
- "0-1 binarization": A 0's and 1's coding the interval/group id
- "conditional info": Conditional information of the interval/group
- "none": Keeps the variable as-is
discretization_method : str
Name of the discretization method, for unsupervised analysis only.
Its valid values are: "EqualWidth" (default), "EqualFrequency" or "None".
Ignored for supervised analysis.
grouping_method : str
Name of the grouping method, for unsupervised analysis only.
Its valid values are: "BasicGrouping" (default) or "None".
Ignored for supervised analysis.
max_parts : int, default 0
Maximum number of parts. If equal to 0 it is automatically calculated.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Returns
-------
tuple
A 2-tuple containing:
- The path of the JSON file report of the process
- The path of the dictionary containing the recoding model
"""
def deploy_model(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
output_data_table_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
output_header_line=True,
output_field_separator="\t",
output_additional_data_tables=None,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Deploys a model on a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object. This file/object
defines the model to be deployed. Note that this model is not necessarily a
predictor, it can be a generic table transformation.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
output_data_table_path : str
Path of the output data file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it deploys the model on ``sample_percentage``
percent of the data. If equal to "Exclude sample" it deploys the model on the
complement of the data selected with "Include sample". See also
:ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It deploys only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
output_header_line : bool, default ``True``
If ``True`` writes a header line with the column names in the output table.
output_field_separator : str, default "\\t"
The field separator character for the output table ("" counts as "\\t").
output_additional_data_tables : dict, optional
A dictionary containing the output data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Raises
------
`TypeError`
Invalid type of an argument.
"""
def build_deployed_dictionary(
dictionary_file_path_or_domain,
dictionary_name,
output_dictionary_file_path,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Builds a dictionary file to read the output table of a deployed model
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
output_dictionary_file_path : str
Path of the output dictionary file.
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Raises
------
`TypeError`
Invalid type of an argument
"""
def sort_data_table(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
output_data_table_path,
sort_variables=None,
detect_format=True,
header_line=None,
field_separator=None,
output_header_line=True,
output_field_separator="\t",
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Sorts a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
output_data_table_path : str
Path of the output data file.
sort_variables : list of str, optional
The names of the variables to sort. If not set sorts the table by its key.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
output_header_line : bool, default ``True``
If ``True`` writes a header line with the column names in the output table.
output_field_separator : str, default "\\t"
The field separator character for the output table ("" counts as "\\t").
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Raises
------
`TypeError`
Invalid type of an argument.
"""
def extract_keys_from_data_table(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
output_data_table_path,
detect_format=True,
header_line=None,
field_separator=None,
output_header_line=True,
output_field_separator="\t",
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Extracts from a data table the unique occurrences of a key variable
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary of the data table.
data_table_path : str
Path of the data table file.
output_data_table_path : str
Path of the output data file.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
output_header_line : bool, default ``True``
If ``True`` writes a header line with the column names in the output table.
output_field_separator : str, default "\\t"
The field separator character for the output table ("" counts as "\\t").
log_file_path : str, default ""
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, default ""
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, default ""
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Raises
------
`TypeError`
Invalid type of an argument.
"""
def train_coclustering(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
coclustering_variables,
coclustering_report_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
frequency_variable="",
min_optimization_time=0,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Trains a coclustering model from a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
coclustering_variables : list of str
The names of variables to use in coclustering. Min length: 2. Max length: 10.
coclustering_report_file_path : str
Path to the coclustering report file in the JSON format.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it trains the coclustering estimator on
``sample_percentage`` percent of the data. If equal to "Exclude sample" it
trains the coclustering estimator on the complement of the data selected with
"Include sample". See also :ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It trains with only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
frequency_variable : str, default ""
Name of frequency variable.
min_optimization_time : int, default 0
Minimum optimization time in seconds.
log_file_path : str, optional
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, optional
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, optional
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Returns
-------
str
The path of the resulting coclustering file.
Raises
------
`ValueError`
Number of coclustering variables out of the range 2-10.
`TypeError`
Invalid type of an argument.
"""
def train_instance_variable_coclustering(
dictionary_file_path_or_domain,
dictionary_name,
data_table_path,
identifier_variable,
coclustering_report_file_path,
detect_format=True,
header_line=None,
field_separator=None,
sample_percentage=100.0,
sampling_mode="Include sample",
selection_variable="",
selection_value="",
additional_data_tables=None,
min_optimization_time=0,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
r"""Trains an instance-variable coclustering model from a data table
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
data_table_path : str
Path of the data table file.
identifier_variable : str
The name of the variable used to identify the instances in instance x variable
coclustering.
coclustering_report_file_path : str
Path to the coclustering report file in the JSON format.
detect_format : bool, default ``True``
If ``True`` detects automatically whether the data table file has a header and
its field separator. It is set to ``False`` if ``header_line`` or
``field_separator`` are set.
header_line : bool, optional (default ``True``)
If ``True`` it uses the first line of the data as column names. Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
field_separator : str, optional (default "\\t")
A field separator character. "" has the same effect as "\\t". Sets
``detect_format`` to ``False`` if set. Ignored if ``detect_format``
is ``True``.
sample_percentage : float, default 100.0
See ``sampling_mode`` option below.
sampling_mode : "Include sample" or "Exclude sample"
If equal to "Include sample" it trains the coclustering estimator on
``sample_percentage`` percent of the data. If equal to "Exclude sample" it
trains the coclustering estimator on the complement of the data selected with
"Include sample". See also :ref:`core-api-sampling-mode`.
selection_variable : str, default ""
It trains with only the records such that the value of ``selection_variable`` is
equal to ``selection_value``. Ignored if equal to "".
selection_value: str or int or float, default ""
See ``selection_variable`` option above. Ignored if equal to "".
additional_data_tables : dict, optional
A dictionary containing the data paths and file paths for a multi-table
dictionary file. For more details see :doc:`/multi_table_primer`.
min_optimization_time : int, default 0
Minimum optimization time in seconds.
log_file_path : str, optional
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, optional
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, optional
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Returns
-------
str
The path of the resulting coclustering file.
Raises
------
`ValueError`
Number of coclustering variables out of the range 2-10.
`TypeError`
Invalid type of an argument.
"""
def simplify_coclustering(
coclustering_file_path,
simplified_coclustering_file_path,
results_dir=None,
max_preserved_information=0,
max_cells=0,
max_total_parts=0,
max_part_numbers=None,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Simplifies a coclustering model
Parameters
----------
coclustering_file_path : str
Path of the coclustering file (extension ``.khc``, or ``.khcj``).
simplified_coclustering_file_path : str
Path of the output coclustering file.
max_preserved_information : int, default 0
Maximum information preserved in the simplified coclustering. If equal to 0
there is no limit.
max_cells : int, default 0
Maximum number of cells in the simplified coclustering. If equal to 0 there
is no limit.
max_total_parts : int, default 0
Maximum number of parts totaled over all variables. If equal to 0 there is no
limit.
max_part_numbers : dict, optional
Dictionary that associates variable names to their maximum number of parts to
preserve in the simplified coclustering. If not set there is no limit.
log_file_path : str, optional
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, optional
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, optional
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Raises
------
`TypeError`
Invalid type of an argument.
"""
def prepare_coclustering_deployment(
dictionary_file_path_or_domain,
dictionary_name,
coclustering_file_path,
table_variable,
deployed_variable_name,
coclustering_dictionary_file_path,
max_preserved_information=0,
max_cells=0,
max_part_numbers=None,
build_cluster_variable=True,
build_distance_variables=False,
build_frequency_variables=False,
variables_prefix="",
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Prepares an *individual-variable* coclustering deployment
Parameters
----------
dictionary_file_path_or_domain : str or `.DictionaryDomain`
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str
Name of the dictionary to be analyzed.
coclustering_file_path : str
Path of the coclustering model file (extension ``.khc`` or ``.khcj``).
table_variable : str
Name of the table variable in the dictionary.
deployed_variable_name : str
Name of the coclustering variable to deploy.
coclustering_dictionary_file_path : str
Path of the coclustering dictionary file for deployment.
max_preserved_information : int, default 0
Maximum information preserved in the simplified coclustering. If equal to 0
there is no limit.
max_cells : int, default 0
Maximum number of cells in the simplified coclustering. If equal to 0 there
is no limit.
max_part_numbers : dict, optional
Dictionary associating variable names to their maximum number of parts to
preserve in the simplified coclustering. For variables not present in
``max_part_numbers`` there is no limit.
build_cluster_variable : bool, default ``True``
If ``True`` includes a cluster id variable in the deployment.
build_distance_variables : bool, default ``False``
If ``True`` includes a cluster distance variable in the deployment.
build_frequency_variables : bool, default ``False``
If ``True`` includes the frequency variables in the deployment.
variables_prefix : str, default ""
Prefix for the variables in the deployment dictionary.
log_file_path : str, optional
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, optional
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, optional
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Raises
------
`TypeError`
Invalid type of an argument
"""
def extract_clusters(
coclustering_file_path,
cluster_variable,
clusters_file_path,
max_preserved_information=0,
max_cells=0,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Extracts clusters to a tab separated (TSV) file
Parameters
----------
coclustering_file_path : str
Path of the coclustering model file (extension ``.khc`` or ``.khcj``).
cluster_variable : str
Name of the variable for which the clusters are extracted.
clusters_file_path : str
Path of the output clusters TSV file.
max_preserved_information : int, default 0
Maximum information preserved in the simplified coclustering. If equal to 0 there
is no limit.
max_cells : int, default 0
Maximum number of cells in the simplified coclustering. If equal to 0 there is
no limit.
log_file_path : str, optional
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, optional
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, optional
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
"""
def detect_data_table_format(
data_table_path,
dictionary_file_path_or_domain=None,
dictionary_name=None,
log_file_path=None,
output_scenario_path=None,
task_file_path=None,
trace=False,
stdout_file_path="",
stderr_file_path="",
max_cores=None,
memory_limit_mb=None,
temp_dir="",
scenario_prologue="",
**kwargs,
):
"""Detects the format of a data table
Runs a heuristic to detect the format of a data table. The detection heuristic is
more accurate if a dictionary with the table schema is provided.
Parameters
----------
data_table_path : str
Path of the data table file.
dictionary_file_path_or_domain : str or `.DictionaryDomain`, optional
Path of a Khiops dictionary file or a DictionaryDomain object.
dictionary_name : str, optional
Name of the dictionary.
log_file_path : str, optional
Path of the log file for the Khiops process (command line option ``-e`` of the desktop app). If
equal to "" then it writes no log file.
output_scenario_path : str, optional
Path of the output Khiops scenario file (command line option ``-o`` of the desktop app). If
the empty string is specified no output scenario file is generated.
task_file_path : str, optional
Path of the task file for the Khiops process (command line option ``-p`` of the desktop app). If
equal to "" then it writes no task file.
trace : bool, default ``False``
If True prints the command line executed of the process and does not delete any temporary files
created.
stdout_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stdout stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
stderr_file_path : str, default ""
*Advanced* Path to a file where the Khiops process writes its stderr stream. Normally Khiops
should not write to this stream but MPI, filesystems plugins or debug versions may do it. The
stream is captured with a UTF-8 encoding and replacing encoding errors. If equal to "" then it
writes no file.
max_cores: int, optional
*Advanced* Maximum number of cores for Khiops executions. If not set, then Khiops uses all
available CPU cores in the system.
memory_limit_mb: int, optional
*Advanced* Maximum amount of memory (in MB) for Khiops executions. If not set, then Khiops
uses all available system memory.
temp_dir: str, default ""
*Advanced* Temporary directory for Khiops executions. If set to "", then Khiops uses the
system's temporary directory.
scenario_prologue: str, default ""
*Advanced* Prologue to prepend to all Khiops execution scenarios.
force_ansi_scenario : bool, default ``False``
*Advanced* If True the internal scenario generated by Khiops will force characters such as
accentuated ones to be decoded with the UTF8->ANSI khiops transformation.
Returns
-------
tuple
A 2-tuple containing:
- the ``header_line`` boolean
- the ``field_separator`` character
These are exactly the parameters expected in many Khiops Python API functions.
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Labels
Priority/0-High
To do now
Size/Days
Some days of work
Status/InDevelopment
The issue is in development by one or more team members
Description
Khiops Core v11 release is approaching. Hence, support should be done at the following levels:
Other associated issues:
`Text` and `TextList` Type Support to the Core APIs #330
The text was updated successfully, but these errors were encountered: