ML TF-IDF #394

Merged
68 commits merged on Mar 25, 2020
Commits
45404bb
bare bones bucketizer
Dec 16, 2019
a56db1d
Merge branch 'master' of github.com:dotnet/spark
Dec 29, 2019
95d0014
implement bucketizer
Dec 29, 2019
fb2d019
first tests
Dec 29, 2019
d759e60
multi column tests
Dec 29, 2019
160fbf4
Merge branch 'master' into bucketizer-ml-313
GoEddie Jan 7, 2020
97ef668
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
4543974
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
fd18cf4
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
64551c9
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
fb70f40
tidying
Jan 8, 2020
119f14d
Merge branch 'bucketizer-ml-313' of github.com:GoEddie/spark into buc…
Jan 8, 2020
9891847
changes after review
Jan 8, 2020
e2ce736
TF-IDF
Jan 9, 2020
3cc3f8d
removing step
Jan 9, 2020
6cfd0e4
single test for IDF and IDFModel
Jan 9, 2020
d466ea2
Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
Jan 13, 2020
633a843
SerDe to handle double[][] for Bucketizer
Jan 13, 2020
f4ecbb0
remove DoubleArrayArrayParam
Jan 13, 2020
b3d4d0f
SerDe for double[][]
Jan 13, 2020
500e7ad
spacing as per other fields
Jan 13, 2020
298f4ec
formatting
Jan 13, 2020
72d36fd
adding getters to tests
Jan 13, 2020
696186c
rollback
Jan 13, 2020
33699ea
Apply suggestions from code review
GoEddie Jan 15, 2020
5b80606
Fixing comments after review
Jan 15, 2020
e771f86
Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
Jan 15, 2020
2f2827a
Merge branch 'bucketizer-ml-313' into ml/HashingTF
Jan 15, 2020
6c12e6a
fixes after review
Jan 15, 2020
a03ddc6
Merge branch 'bucketizer-ml-313' into ml/HashingTF
Jan 15, 2020
dc7bf4b
wip
Jan 15, 2020
283f8ea
Hashing TF from ml not mllib
Jan 15, 2020
9d0f7ea
tests for HashingTF
Jan 16, 2020
107e01b
adding tests
Jan 16, 2020
33e50f2
formatting
Jan 18, 2020
d85ca33
removing project, in spark main project
Jan 18, 2020
3af69f9
merge
Jan 18, 2020
c15ad6b
merge
Jan 18, 2020
5c358d1
testing
Jan 18, 2020
9234dba
formatting
Jan 18, 2020
a524396
tidying:
Jan 18, 2020
fa9c065
removing change
Jan 18, 2020
13adf7b
removing change
Jan 18, 2020
9147c12
docs
Jan 18, 2020
90937a8
formatting
Jan 18, 2020
c5b604a
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Jan 27, 2020
c28ac3f
Merge branch 'master' into ml/HashingTF
GoEddie Jan 29, 2020
6675137
Merge branch 'master' into ml/HashingTF
imback82 Feb 3, 2020
adca1d6
Apply suggestions from code review
GoEddie Feb 5, 2020
44a4bb5
adding datatype udf where sqlType is available
Feb 7, 2020
f52f6eb
changes from code review
Feb 7, 2020
f425f29
Merge branch 'master' into ml/HashingTF
GoEddie Feb 9, 2020
15bae3e
feedback from review
Mar 2, 2020
30e95ad
Merge branch 'ml/HashingTF' of github.com:GoEddie/spark into ml/Hashi…
Mar 2, 2020
d55140d
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Mar 2, 2020
64066a5
fixes from feedback
Mar 2, 2020
37cf616
reverting fix for ArrayType
Mar 3, 2020
cd07e56
params comments
Mar 3, 2020
d12f348
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Mar 5, 2020
f36fc12
Merge branch 'master' into ml/HashingTF
imback82 Mar 8, 2020
5c9c2b6
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Mar 23, 2020
573fc1a
formatting and comments from feedback
Mar 23, 2020
4cd86e3
typo ideModel and not idfModel
Mar 23, 2020
57729ee
cant use var here
Mar 23, 2020
da7660e
cant use var here
Mar 23, 2020
22ff5e5
formatting from feedback
Mar 25, 2020
1ecb215
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Mar 25, 2020
8e665a3
Merge branch 'master' into ml/HashingTF
imback82 Mar 25, 2020
Files changed
BucketizerTests.cs
@@ -3,6 +3,8 @@
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;
using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;
@@ -47,6 +49,15 @@ public void TestBucketizer()
Assert.Equal(expectedInputCol, bucketizer.GetInputCol());
Assert.Equal(expectedOutputCol, bucketizer.GetOutputCol());
Assert.Equal(expectedSplits, bucketizer.GetSplits());

using (var tempDirectory = new TemporaryDirectory())
{
string savePath = Path.Join(tempDirectory.Path, "bucket");
bucketizer.Save(savePath);

Bucketizer loadedBucketizer = Bucketizer.Load(savePath);
Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
}
}

[Fact]
HashingTFTests.cs
@@ -0,0 +1,65 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class HashingTFTests
{
private readonly SparkSession _spark;

public HashingTFTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestHashingTF()
{
string expectedInputCol = "input_col";
string expectedOutputCol = "output_col";
int expectedFeatures = 10;

Assert.IsType<HashingTF>(new HashingTF());

HashingTF hashingTf = new HashingTF("my-unique-id")
.SetNumFeatures(expectedFeatures)
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol);

Assert.Equal(expectedFeatures, hashingTf.GetNumFeatures());
Assert.Equal(expectedInputCol, hashingTf.GetInputCol());
Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol());

DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" +
" as input_col");

DataFrame output = hashingTf.Transform(input);
DataFrame outputVector = output.Select(expectedOutputCol);

Assert.Contains(expectedOutputCol, outputVector.Columns());

using (var tempDirectory = new TemporaryDirectory())
{
string savePath = Path.Join(tempDirectory.Path, "hashingTF");
hashingTf.Save(savePath);

HashingTF loadedHashingTf = HashingTF.Load(savePath);
Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid());
}

hashingTf.SetBinary(true);
Assert.True(hashingTf.GetBinary());
}
}
}
IDFModelTests.cs
@@ -0,0 +1,70 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class IDFModelTests
{
private readonly SparkSession _spark;

public IDFModelTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestIDFModel()
{
int expectedDocFrequency = 1980;
string expectedInputCol = "rawFeatures";
string expectedOutputCol = "features";

DataFrame sentenceData =
_spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

Tokenizer tokenizer = new Tokenizer()
.SetInputCol("sentence")
.SetOutputCol("words");

DataFrame wordsData = tokenizer.Transform(sentenceData);

HashingTF hashingTF = new HashingTF()
.SetInputCol("words")
.SetOutputCol(expectedInputCol)
.SetNumFeatures(20);

DataFrame featurizedData = hashingTF.Transform(wordsData);

IDF idf = new IDF()
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol)
.SetMinDocFreq(expectedDocFrequency);

IDFModel idfModel = idf.Fit(featurizedData);

DataFrame rescaledData = idfModel.Transform(featurizedData);
Assert.Contains(expectedOutputCol, rescaledData.Columns());

Assert.Equal(expectedInputCol, idfModel.GetInputCol());
Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

using (var tempDirectory = new TemporaryDirectory())
{
string modelPath = Path.Join(tempDirectory.Path, "idfModel");
idfModel.Save(modelPath);

IDFModel loadedModel = IDFModel.Load(modelPath);
Assert.Equal(idfModel.Uid(), loadedModel.Uid());
}
}
}
}
49 changes: 49 additions & 0 deletions src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
@@ -0,0 +1,49 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class IDFTests
{
private readonly SparkSession _spark;

public IDFTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestIDFModel()
{
string expectedInputCol = "rawFeatures";
string expectedOutputCol = "features";
int expectedDocFrequency = 100;

IDF idf = new IDF()
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol)
.SetMinDocFreq(expectedDocFrequency);

Assert.Equal(expectedInputCol, idf.GetInputCol());
Assert.Equal(expectedOutputCol, idf.GetOutputCol());
Assert.Equal(expectedDocFrequency, idf.GetMinDocFreq());

using (var tempDirectory = new TemporaryDirectory())
{
string savePath = Path.Join(tempDirectory.Path, "IDF");
idf.Save(savePath);

IDF loadedIdf = IDF.Load(savePath);
Assert.Equal(idf.Uid(), loadedIdf.Uid());
}
}
}
}
TokenizerTests.cs
@@ -0,0 +1,55 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class TokenizerTests
{
private readonly SparkSession _spark;

public TokenizerTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestTokenizer()
{
string expectedUid = "theUid";
string expectedInputCol = "input_col";
string expectedOutputCol = "output_col";

DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" +
" from range(100)");

Tokenizer tokenizer = new Tokenizer(expectedUid)
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol);

DataFrame output = tokenizer.Transform(input);

Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol));
Assert.Equal(expectedInputCol, tokenizer.GetInputCol());
Assert.Equal(expectedOutputCol, tokenizer.GetOutputCol());

using (var tempDirectory = new TemporaryDirectory())
{
string savePath = Path.Join(tempDirectory.Path, "Tokenizer");
tokenizer.Save(savePath);

Tokenizer loadedTokenizer = Tokenizer.Load(savePath);
Assert.Equal(tokenizer.Uid(), loadedTokenizer.Uid());
}

Assert.Equal(expectedUid, tokenizer.Uid());
}
}
}
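
For reference, the pieces above compose into a full TF-IDF pipeline. The following is a minimal, hypothetical sketch of a standalone .NET for Apache Spark program using the new classes; it assumes only the fluent API exercised by the tests in this PR plus SparkSession.Builder(), and the app name, column names, and sample data are illustrative rather than taken from the PR.

// Hypothetical end-to-end TF-IDF sketch using the feature classes added in this PR.
// App name, column names, and sample data are illustrative only.
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;

namespace TfIdfExample
{
    class Program
    {
        static void Main()
        {
            SparkSession spark = SparkSession
                .Builder()
                .AppName("tf-idf-example")
                .GetOrCreate();

            DataFrame sentences = spark.Sql(
                "SELECT 'Hi I heard about Spark' AS sentence " +
                "UNION ALL SELECT 'Spark is written in Scala' AS sentence");

            // Split each sentence into lower-cased words.
            DataFrame words = new Tokenizer()
                .SetInputCol("sentence")
                .SetOutputCol("words")
                .Transform(sentences);

            // Hash the words into a fixed-size term-frequency vector.
            DataFrame rawFeatures = new HashingTF()
                .SetInputCol("words")
                .SetOutputCol("rawFeatures")
                .SetNumFeatures(1024)
                .Transform(words);

            // Fit the IDF model on the term frequencies and rescale them.
            IDFModel idfModel = new IDF()
                .SetInputCol("rawFeatures")
                .SetOutputCol("features")
                .SetMinDocFreq(1)
                .Fit(rawFeatures);

            DataFrame rescaled = idfModel.Transform(rawFeatures);
            rescaled.Select("features").Show();

            spark.Stop();
        }
    }
}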