|
| 1 | +// Licensed to the .NET Foundation under one or more agreements. |
| 2 | +// The .NET Foundation licenses this file to you under the MIT license. |
| 3 | +// See the LICENSE file in the project root for more information. |
| 4 | + |
| 5 | +using Microsoft.Spark.Interop; |
| 6 | +using Microsoft.Spark.Interop.Ipc; |
| 7 | +using Microsoft.Spark.Sql; |
| 8 | + |
namespace Microsoft.Spark.ML.Feature
{
    /// <summary>
    /// Wraps the JVM-side <c>org.apache.spark.ml.feature.CountVectorizer</c>. Every member
    /// forwards to the underlying JVM object; each setter returns a
    /// <see cref="CountVectorizer"/> wrapping the reference handed back by the JVM call, so
    /// setter calls can be chained.
    /// </summary>
    public class CountVectorizer : FeatureBase<CountVectorizer>, IJvmObjectReferenceProvider
    {
        private static readonly string s_countVectorizerClassName =
            "org.apache.spark.ml.feature.CountVectorizer";

        /// <summary>
        /// Creates a <see cref="CountVectorizer"/> without any parameters.
        /// </summary>
        public CountVectorizer() : base(s_countVectorizerClassName)
        {
        }

        /// <summary>
        /// Creates a <see cref="CountVectorizer"/> with a UID that is used to give the
        /// <see cref="CountVectorizer"/> a unique ID.
        /// </summary>
        /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
        public CountVectorizer(string uid) : base(s_countVectorizerClassName, uid)
        {
        }

        /// <summary>
        /// Creates a <see cref="CountVectorizer"/> around an existing JVM object reference.
        /// </summary>
        /// <param name="jvmObject">The JVM object reference to wrap.</param>
        internal CountVectorizer(JvmObjectReference jvmObject) : base(jvmObject)
        {
        }

        JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

        /// <summary>Fits a model to the input data.</summary>
        /// <param name="dataFrame">The <see cref="DataFrame"/> to fit the model to.</param>
        /// <returns><see cref="CountVectorizerModel"/></returns>
        public CountVectorizerModel Fit(DataFrame dataFrame) =>
            new CountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("fit", dataFrame));

        /// <summary>
        /// Loads the <see cref="CountVectorizer"/> that was previously saved using Save.
        /// </summary>
        /// <param name="path">
        /// The path the previous <see cref="CountVectorizer"/> was saved to.
        /// </param>
        /// <returns>New <see cref="CountVectorizer"/> object</returns>
        public static CountVectorizer Load(string path) =>
            WrapAsCountVectorizer(
                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
                    s_countVectorizerClassName, "load", path));

        /// <summary>
        /// Gets the binary toggle to control the output vector values. If True, all nonzero counts
        /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic
        /// models that model binary events rather than integer counts. Default: false
        /// </summary>
        /// <returns>boolean</returns>
        public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary");

        /// <summary>
        /// Sets the binary toggle to control the output vector values. If True, all nonzero counts
        /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic
        /// models that model binary events rather than integer counts. Default: false
        /// </summary>
        /// <param name="value">Turn the binary toggle on or off</param>
        /// <returns><see cref="CountVectorizer"/> with the new binary toggle value set</returns>
        public CountVectorizer SetBinary(bool value) =>
            WrapAsCountVectorizer(_jvmObject.Invoke("setBinary", value));

        /// <summary>
        /// Gets the name of the column that the <see cref="CountVectorizer"/> reads from.
        /// This would have been set by SetInputCol.
        /// </summary>
        /// <returns>The input column of type string</returns>
        public string GetInputCol() => (string)_jvmObject.Invoke("getInputCol");

        /// <summary>
        /// Sets the column that the <see cref="CountVectorizer"/> should read from.
        /// </summary>
        /// <param name="value">The name of the column to use as the source.</param>
        /// <returns><see cref="CountVectorizer"/> with the input column set</returns>
        public CountVectorizer SetInputCol(string value) =>
            WrapAsCountVectorizer(_jvmObject.Invoke("setInputCol", value));

        /// <summary>
        /// Gets the name of the new column the <see cref="CountVectorizer"/> creates in the
        /// DataFrame.
        /// </summary>
        /// <returns>The name of the output column.</returns>
        public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol");

        /// <summary>
        /// Sets the name of the new column the <see cref="CountVectorizer"/> creates in the
        /// DataFrame.
        /// </summary>
        /// <param name="value">The name of the output column which will be created.</param>
        /// <returns><see cref="CountVectorizer"/> with the output column set</returns>
        public CountVectorizer SetOutputCol(string value) =>
            WrapAsCountVectorizer(_jvmObject.Invoke("setOutputCol", value));

        /// <summary>
        /// Gets the maximum number of different documents a term could appear in to be included in
        /// the vocabulary. A term that appears more than the threshold will be ignored. If this is
        /// an integer greater than or equal to 1, this specifies the maximum number of documents
        /// the term could appear in; if this is a double in [0,1), then this specifies the maximum
        /// fraction of documents the term could appear in.
        /// </summary>
        /// <returns>The maximum document frequency threshold</returns>
        [Since(Versions.V2_4_0)]
        public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF");

        /// <summary>
        /// Sets the maximum number of different documents a term could appear in to be included in
        /// the vocabulary. A term that appears more than the threshold will be ignored. If this is
        /// an integer greater than or equal to 1, this specifies the maximum number of documents
        /// the term could appear in; if this is a double in [0,1), then this specifies the maximum
        /// fraction of documents the term could appear in.
        /// </summary>
        /// <param name="value">The maximum document frequency threshold</param>
        /// <returns><see cref="CountVectorizer"/> with the max df value set</returns>
        [Since(Versions.V2_4_0)]
        public CountVectorizer SetMaxDF(double value) =>
            WrapAsCountVectorizer(_jvmObject.Invoke("setMaxDF", value));

        /// <summary>
        /// Gets the minimum number of different documents a term must appear in to be included in
        /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the
        /// number of documents the term must appear in; if this is a double in [0,1), then this
        /// specifies the fraction of documents.
        /// </summary>
        /// <returns>The minimum document frequency threshold</returns>
        public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF");

        /// <summary>
        /// Sets the minimum number of different documents a term must appear in to be included in
        /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the
        /// number of documents the term must appear in; if this is a double in [0,1), then this
        /// specifies the fraction of documents.
        /// </summary>
        /// <param name="value">The minimum document frequency threshold</param>
        /// <returns><see cref="CountVectorizer"/> with the min df value set</returns>
        public CountVectorizer SetMinDF(double value) =>
            WrapAsCountVectorizer(_jvmObject.Invoke("setMinDF", value));

        /// <summary>
        /// Gets the filter to ignore rare words in a document. For each document, terms with
        /// frequency/count less than the given threshold are ignored. If this is an integer
        /// greater than or equal to 1, then this specifies a count (of times the term must appear
        /// in the document); if this is a double in [0,1), then this specifies a fraction (out of
        /// the document's token count).
        ///
        /// Note that the parameter is only used in transform of CountVectorizerModel and does not
        /// affect fitting.
        /// </summary>
        /// <returns>Minimum term frequency</returns>
        public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF");

        /// <summary>
        /// Sets the filter to ignore rare words in a document. For each document, terms with
        /// frequency/count less than the given threshold are ignored. If this is an integer
        /// greater than or equal to 1, then this specifies a count (of times the term must appear
        /// in the document); if this is a double in [0,1), then this specifies a fraction (out of
        /// the document's token count).
        ///
        /// Note that the parameter is only used in transform of CountVectorizerModel and does not
        /// affect fitting.
        /// </summary>
        /// <param name="value">Minimum term frequency</param>
        /// <returns><see cref="CountVectorizer"/> with the min term frequency set</returns>
        public CountVectorizer SetMinTF(double value) =>
            WrapAsCountVectorizer(_jvmObject.Invoke("setMinTF", value));

        /// <summary>
        /// Gets the max size of the vocabulary. <see cref="CountVectorizer"/> will build a
        /// vocabulary that only considers the top vocabSize terms ordered by term frequency across
        /// the corpus.
        /// </summary>
        /// <returns>The max size of the vocabulary of type int.</returns>
        public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize");

        /// <summary>
        /// Sets the max size of the vocabulary. <see cref="CountVectorizer"/> will build a
        /// vocabulary that only considers the top vocabSize terms ordered by term frequency across
        /// the corpus.
        /// </summary>
        /// <param name="value">The max vocabulary size</param>
        /// <returns><see cref="CountVectorizer"/> with the max vocab value set</returns>
        public CountVectorizer SetVocabSize(int value) =>
            WrapAsCountVectorizer(_jvmObject.Invoke("setVocabSize", value));

        /// <summary>
        /// Wraps the result of a JVM call in a <see cref="CountVectorizer"/>. The cast to
        /// <see cref="JvmObjectReference"/> is performed here, so call sites pass the raw
        /// result of the invocation without casting it themselves.
        /// </summary>
        /// <param name="obj">
        /// The object returned by the JVM call; it must be a <see cref="JvmObjectReference"/>.
        /// </param>
        /// <returns>New <see cref="CountVectorizer"/> wrapping the given reference</returns>
        private static CountVectorizer WrapAsCountVectorizer(object obj) =>
            new CountVectorizer((JvmObjectReference)obj);
    }
}
0 commit comments