Skip to content

FeatureHasher #652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Oct 2, 2020
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c933914
FeatureHasher
Sep 3, 2020
0801108
tidy
Sep 4, 2020
ac26205
tidying comments
Sep 4, 2020
7e9e8cc
Merge branch 'master' into ml/FeatureHasher
imback82 Sep 8, 2020
3652b55
Merge branch 'master' of github.com:dotnet/spark into ml/FeatureHasher
Sep 8, 2020
d6094e3
adding extra test for FeatureBase
Sep 8, 2020
19626f7
Merge branch 'ml/FeatureHasher' of github.com:GoEddie/spark into ml/F…
Sep 8, 2020
ffd0cfc
Trigger Build
Sep 8, 2020
a0356cc
missing file header
Sep 8, 2020
7921e04
comments
Sep 8, 2020
b5fcee2
naming better
Sep 8, 2020
01d40f8
indentation
Sep 8, 2020
89694dc
changes after feedback
Sep 9, 2020
2d8eaa1
test summary
Sep 9, 2020
bad829a
trigger build
Sep 9, 2020
012bd6b
Merge branch 'master' into ml/FeatureHasher
GoEddie Sep 11, 2020
cae76fc
Merge branch 'master' into ml/FeatureHasher
imback82 Sep 12, 2020
105d690
Merge branch 'master' into ml/FeatureHasher
GoEddie Sep 14, 2020
dd75d78
Merge branch 'master' of github.com:dotnet/spark into ml/FeatureHasher
Sep 29, 2020
c2926b3
Merge branch 'master' into ml/FeatureHasher
suhsteve Sep 29, 2020
02f06a9
changes after review
Sep 29, 2020
72741fe
Merge branch 'ml/FeatureHasher' of github.com:GoEddie/spark into ml/F…
Sep 29, 2020
ecb9e5f
formatting
GoEddie Sep 29, 2020
b28c1a7
formatting
GoEddie Sep 29, 2020
e94a601
Reverting change
GoEddie Sep 29, 2020
88eec00
Merge branch 'master' into ml/FeatureHasher
GoEddie Oct 1, 2020
752e48d
Merge branch 'master' into ml/FeatureHasher
GoEddie Oct 2, 2020
54d55de
retrigger build
Oct 2, 2020
e2aeab7
Merge branch 'master' into ml/FeatureHasher
imback82 Oct 2, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public BucketizerTests(SparkFixture fixture)
[Fact]
public void TestBucketizer()
{
var expectedSplits = new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };
var expectedSplits = new double[]{ double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

string expectedHandle = "skip";
string expectedUid = "uid";
Expand Down Expand Up @@ -60,18 +60,7 @@ public void TestBucketizer()
Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
}

Assert.NotEmpty(bucketizer.ExplainParams());

Param handleInvalidParam = bucketizer.GetParam("handleInvalid");
Assert.NotEmpty(handleInvalidParam.Doc);
Assert.NotEmpty(handleInvalidParam.Name);
Assert.Equal(handleInvalidParam.Parent, bucketizer.Uid());

Assert.NotEmpty(bucketizer.ExplainParam(handleInvalidParam));
bucketizer.Set(handleInvalidParam, "keep");
Assert.Equal("keep", bucketizer.GetHandleInvalid());

Assert.Equal("error", bucketizer.Clear(handleInvalidParam).GetHandleInvalid());
FeatureBaseTests<Bucketizer>.TestBase(bucketizer, "handleInvalid", "keep");
}

[Fact]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public CountVectorizerModelTests(SparkFixture fixture)
}

/// <summary>
/// Test that we can create a CountVectorizerModel, pass in a specifc vocabulary to use
/// Test that we can create a CountVectorizerModel, pass in a specific vocabulary to use
/// when creating the model. Verify the standard features methods as well as load/save.
/// </summary>
[Fact]
Expand Down Expand Up @@ -68,6 +68,8 @@ public void TestCountVectorizerModel()
Assert.IsType<int>(countVectorizerModel.GetVocabSize());
Assert.NotEmpty(countVectorizerModel.ExplainParams());
Assert.NotEmpty(countVectorizerModel.ToString());

FeatureBaseTests<CountVectorizerModel>.TestBase(countVectorizerModel, "minDF", 100);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ public void TestCountVectorizer()

Assert.NotEmpty(countVectorizer.ExplainParams());
Assert.NotEmpty(countVectorizer.ToString());

FeatureBaseTests<CountVectorizer>.TestBase(countVectorizer, "minDF", 0.4);
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.ML.Feature.Param;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
/// <summary>
/// Shared checks for any object deriving from <see cref="FeatureBase{T}"/>.
/// </summary>
public static class FeatureBaseTests<T>
{
    /// <summary>
    /// Exercises the behavior common to every feature object: params can be
    /// explained and looked up by name, a param can be set and cleared, and the
    /// object exposes a string UID.
    /// </summary>
    /// <param name="testObject">The feature object under test.</param>
    /// <param name="paramName">Name of a param the object exposes.</param>
    /// <param name="paramValue">A valid value to set that param to.</param>
    internal static void TestBase(
        FeatureBase<T> testObject,
        string paramName,
        object paramValue)
    {
        string explained = testObject.ExplainParams();
        Assert.NotEmpty(explained);

        Param namedParam = testObject.GetParam(paramName);
        Assert.NotEmpty(namedParam.Doc);
        Assert.NotEmpty(namedParam.Name);
        Assert.Equal(namedParam.Parent, testObject.Uid());

        Assert.NotEmpty(testObject.ExplainParam(namedParam));
        testObject.Set(namedParam, paramValue);
        Assert.IsAssignableFrom<Identifiable>(testObject.Clear(namedParam));

        Assert.IsType<string>(testObject.Uid());
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class FeatureHasherTests
{
    private readonly SparkSession _spark;

    public FeatureHasherTests(SparkFixture fixture)
    {
        _spark = fixture.Spark;
    }

    /// <summary>
    /// Creates a DataFrame mixing numeric, boolean and string columns, runs it
    /// through a configured <see cref="FeatureHasher"/> and verifies that the
    /// getters, schema transformation and transform return the expected types,
    /// then runs the shared FeatureBase checks.
    /// </summary>
    [Fact]
    public void TestFeatureHasher()
    {
        DataFrame dataFrame = _spark.CreateDataFrame(
            new List<GenericRow>
            {
                new GenericRow(new object[] { 2.0D, true, "1", "foo" }),
                new GenericRow(new object[] { 3.0D, false, "2", "bar" })
            },
            new StructType(new List<StructField>
            {
                new StructField("real", new DoubleType()),
                new StructField("bool", new BooleanType()),
                new StructField("stringNum", new StringType()),
                new StructField("string", new StringType())
            }));

        FeatureHasher hasher = new FeatureHasher()
            .SetInputCols(new List<string>() { "real", "bool", "stringNum", "string" })
            .SetOutputCol("features")
            .SetCategoricalCols(new List<string>() { "real", "string" })
            .SetNumFeatures(10);

        Assert.IsType<string>(hasher.GetOutputCol());
        // Fixed: no space between IsType and the generic argument, matching the
        // surrounding assertions.
        Assert.IsType<string[]>(hasher.GetInputCols());
        Assert.IsType<List<string>>(hasher.GetCategoricalCols());
        Assert.IsType<int>(hasher.GetNumFeatures());
        Assert.IsType<StructType>(hasher.TransformSchema(dataFrame.Schema()));
        Assert.IsType<DataFrame>(hasher.Transform(dataFrame));

        FeatureBaseTests<FeatureHasher>.TestBase(hasher, "numFeatures", 1000);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ public void TestHashingTF()

hashingTf.SetBinary(true);
Assert.True(hashingTf.GetBinary());

FeatureBaseTests<HashingTF>.TestBase(hashingTf, "numFeatures", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ public void TestIDFModel()
IDFModel loadedModel = IDFModel.Load(modelPath);
Assert.Equal(idfModel.Uid(), loadedModel.Uid());
}

FeatureBaseTests<IDFModel>.TestBase(idfModel, "minDocFreq", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ public void TestIDFModel()
IDF loadedIdf = IDF.Load(savePath);
Assert.Equal(idf.Uid(), loadedIdf.Uid());
}

FeatureBaseTests<IDF>.TestBase(idf, "minDocFreq", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ public void TestTokenizer()
}

Assert.Equal(expectedUid, tokenizer.Uid());

FeatureBaseTests<Tokenizer>.TestBase(tokenizer, "inputCol", "input_col");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ public void TestWord2VecModel()
Word2VecModel loadedModel = Word2VecModel.Load(savePath);
Assert.Equal(model.Uid(), loadedModel.Uid());
}

FeatureBaseTests<Word2VecModel>.TestBase(model, "maxIter", 2);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ public void TestWord2Vec()
Word2Vec loadedWord2Vec = Word2Vec.Load(savePath);
Assert.Equal(word2vec.Uid(), loadedWord2Vec.Uid());
}

FeatureBaseTests<Word2Vec>.TestBase(word2vec, "maxIter", 2);
}
}
}
145 changes: 145 additions & 0 deletions src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;
using System.Linq;
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;

namespace Microsoft.Spark.ML.Feature
{
public class FeatureHasher : FeatureBase<FeatureHasher>, IJvmObjectReferenceProvider
{
    private static readonly string s_featureHasherClassName =
        "org.apache.spark.ml.feature.FeatureHasher";

    /// <summary>
    /// Creates a <see cref="FeatureHasher"/> without any parameters.
    /// </summary>
    internal FeatureHasher() : base(s_featureHasherClassName)
    {
    }

    /// <summary>
    /// Creates a <see cref="FeatureHasher"/> with a UID that is used to give the
    /// <see cref="FeatureHasher"/> a unique ID.
    /// </summary>
    /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
    internal FeatureHasher(string uid) : base(s_featureHasherClassName, uid)
    {
    }

    /// <summary>
    /// Wraps an existing JVM-side FeatureHasher object.
    /// </summary>
    /// <param name="jvmObject">Reference to the JVM object to wrap.</param>
    internal FeatureHasher(JvmObjectReference jvmObject) : base(jvmObject)
    {
    }

    JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

    /// <summary>
    /// Loads the <see cref="FeatureHasher"/> that was previously saved using Save.
    /// </summary>
    /// <param name="path">
    /// The path the previous <see cref="FeatureHasher"/> was saved to.
    /// </param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public static FeatureHasher Load(string path) =>
        WrapAsFeatureHasher(
            SparkEnvironment.JvmBridge.CallStaticJavaMethod(
                s_featureHasherClassName, "load", path));

    /// <summary>
    /// Gets a list of the columns which have been specified as categorical columns.
    /// </summary>
    /// <returns>List of categorical columns, set by SetCategoricalCols</returns>
    public IEnumerable<string> GetCategoricalCols() =>
        ((string[])_jvmObject.Invoke("getCategoricalCols")).ToList();

    /// <summary>
    /// Marks columns as categorical columns.
    /// </summary>
    /// <param name="value">List of column names to mark as categorical columns</param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public FeatureHasher SetCategoricalCols(IEnumerable<string> value) =>
        WrapAsFeatureHasher(_jvmObject.Invoke("setCategoricalCols", value));

    /// <summary>
    /// Gets the columns that the <see cref="FeatureHasher"/> should read from and convert into
    /// hashes. This would have been set by SetInputCol.
    /// </summary>
    /// <returns>IEnumerable&lt;string&gt;, the input columns</returns>
    public IEnumerable<string> GetInputCols() => (string[])_jvmObject.Invoke("getInputCols");

    /// <summary>
    /// Sets the columns that the <see cref="FeatureHasher"/> should read from and convert into
    /// hashes.
    /// </summary>
    /// <param name="value">The name of the column to as use the source of the hash</param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public FeatureHasher SetInputCols(IEnumerable<string> value) =>
        WrapAsFeatureHasher(_jvmObject.Invoke("setInputCols", value));

    /// <summary>
    /// Gets the number of features that should be used. Since a simple modulo is used to
    /// transform the hash function to a column index, it is advisable to use a power of two
    /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the
    /// columns.
    /// </summary>
    /// <returns>The number of features to be used</returns>
    public int GetNumFeatures() => (int)_jvmObject.Invoke("getNumFeatures");

    /// <summary>
    /// Sets the number of features that should be used. Since a simple modulo is used to
    /// transform the hash function to a column index, it is advisable to use a power of two as
    /// the numFeatures parameter; otherwise the features will not be mapped evenly to the
    /// columns.
    /// </summary>
    /// <param name="value">The number of features to use, ideally a power of two</param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public FeatureHasher SetNumFeatures(int value) =>
        WrapAsFeatureHasher(_jvmObject.Invoke("setNumFeatures", value));

    /// <summary>
    /// Gets the name of the column the output data will be written to. This is set by
    /// SetOutputCol.
    /// </summary>
    /// <returns>string, the output column</returns>
    public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol");

    /// <summary>
    /// The <see cref="FeatureHasher"/> will create a new column in the DataFrame, this is the
    /// name of the new column.
    /// </summary>
    /// <param name="value">The name of the new column which will contain the hash</param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public FeatureHasher SetOutputCol(string value) =>
        WrapAsFeatureHasher(_jvmObject.Invoke("setOutputCol", value));

    /// <summary>
    /// Transforms the input <see cref="DataFrame"/>. It is recommended that you validate that
    /// the transform will succeed by calling TransformSchema.
    /// </summary>
    /// <param name="value">Input <see cref="DataFrame"/> to transform</param>
    /// <returns>Transformed <see cref="DataFrame"/></returns>
    public DataFrame Transform(DataFrame value) =>
        new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", value));

    /// <summary>
    /// Check transform validity and derive the output schema from the input schema.
    ///
    /// This checks for validity of interactions between parameters during Transform and
    /// raises an exception if any parameter value is invalid.
    ///
    /// Typical implementation should first conduct verification on schema change and parameter
    /// validity, including complex parameter interaction checks.
    /// </summary>
    /// <param name="value">
    /// The <see cref="StructType"/> of the <see cref="DataFrame"/> which will be transformed.
    /// </param>
    /// <returns>
    /// The <see cref="StructType"/> of the output schema that would have been derived from the
    /// input schema, if Transform had been called.
    /// </returns>
    public StructType TransformSchema(StructType value) =>
        new StructType(
            (JvmObjectReference)_jvmObject.Invoke("transformSchema",
                DataType.FromJson(_jvmObject.Jvm, value.Json)));

    // Centralizes the cast from the raw JVM invocation result to a typed wrapper.
    private static FeatureHasher WrapAsFeatureHasher(object obj) =>
        new FeatureHasher((JvmObjectReference)obj);
}
}