Skip to content

Commit daa5162

Browse files
cavenesstfx-copybara
authored and committed
Add support for validating that number of unique values is within a user-specified range.
PiperOrigin-RevId: 315969158
1 parent a80c0b0 commit daa5162

File tree

11 files changed

+632
-163
lines changed

11 files changed

+632
-163
lines changed

RELEASE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
limit on number of examples.
1616
* Fix bug in display_anomalies that prevented dataset-level anomalies from
1717
being displayed.
18+
* Trigger anomalies when a feature has a number of unique values that does not
19+
conform to the specified minimum/maximum.
1820

1921
## Known Issues
2022

g3doc/anomalies.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,35 @@ condition(s) under which each anomaly type is detected.
193193

194194
- Anomaly type not detected in TFDV
195195

196+
- `FEATURE_TYPE_HIGH_UNIQUE`
197+
198+
- Schema Fields:
199+
- `feature.unique_constraints.max`
200+
- Statistics Fields:
201+
- `feature.string_stats.unique`
202+
- Detection Condition:
203+
- `feature.string_stats.unique` > `feature.unique_constraints.max`
204+
205+
- `FEATURE_TYPE_LOW_UNIQUE`
206+
207+
- Schema Fields:
208+
- `feature.unique_constraints.min`
209+
- Statistics Fields:
210+
- `feature.string_stats.unique`
211+
- Detection Condition:
212+
- `feature.string_stats.unique` < `feature.unique_constraints.min`
213+
214+
- `FEATURE_TYPE_NO_UNIQUE`
215+
216+
- Schema Fields:
217+
- `feature.unique_constraints`
218+
- Statistics Fields:
219+
- `feature.string_stats.unique`
220+
- Detection Condition:
221+
- `feature.unique_constraints` specified but no
222+
`feature.string_stats.unique` present (as is the case where the
223+
feature is not a string or categorical)
224+
196225
- `FLOAT_TYPE_BIG_FLOAT`
197226

198227
- Schema Fields:

tensorflow_data_validation/anomalies/feature_util.cc

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,5 +320,42 @@ std::vector<Description> UpdatePresence(
320320
return descriptions;
321321
}
322322

323+
std::vector<Description> UpdateUniqueConstraints(
324+
const FeatureStatsView& feature_stats_view,
325+
tensorflow::metadata::v0::Feature* feature) {
326+
std::vector<Description> descriptions;
327+
const absl::optional<int> num_unique = feature_stats_view.GetNumUnique();
328+
if (num_unique) {
329+
if (num_unique < feature->unique_constraints().min()) {
330+
descriptions.push_back(
331+
{tensorflow::metadata::v0::AnomalyInfo::FEATURE_TYPE_LOW_UNIQUE,
332+
"Low number of unique values",
333+
absl::StrCat(
334+
"Expected at least ", feature->unique_constraints().min(),
335+
" unique values but found only ", num_unique.value(), ".")});
336+
feature->mutable_unique_constraints()->set_min(num_unique.value());
337+
}
338+
if (num_unique > feature->unique_constraints().max()) {
339+
descriptions.push_back(
340+
{tensorflow::metadata::v0::AnomalyInfo::FEATURE_TYPE_HIGH_UNIQUE,
341+
"High number of unique values",
342+
absl::StrCat("Expected no more than ",
343+
feature->unique_constraints().max(),
344+
" unique values but found ", num_unique.value(), ".")});
345+
feature->mutable_unique_constraints()->set_max(num_unique.value());
346+
}
347+
} else {
348+
descriptions.push_back(
349+
{tensorflow::metadata::v0::AnomalyInfo::FEATURE_TYPE_NO_UNIQUE,
350+
"No unique values",
351+
absl::StrCat(
352+
"UniqueConstraints specified for the feature, but unique values "
353+
"were not counted (i.e., feature is not string or "
354+
"categorical).")});
355+
feature->clear_unique_constraints();
356+
}
357+
return descriptions;
358+
}
359+
323360
} // namespace data_validation
324361
} // namespace tensorflow

tensorflow_data_validation/anomalies/feature_util.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,12 @@ double GetMaxOffDomain(const tensorflow::metadata::v0::DistributionConstraints&
107107

108108
// Clear the domain of the feature.
109109
void ClearDomain(tensorflow::metadata::v0::Feature* feature);
110+
111+
// Validates the number of unique values reported by `feature_stats_view`
// against the UniqueConstraints specified for `feature`, and updates those
// constraints in place so that they accept the observed value.
//
// Returns one Description per anomaly found (empty if there is none):
//  - fewer unique values than unique_constraints.min: the min is lowered to
//    the observed count;
//  - more unique values than unique_constraints.max: the max is raised to the
//    observed count;
//  - no unique count available in the statistics: the unique_constraints are
//    cleared from `feature`.
112+
std::vector<Description> UpdateUniqueConstraints(
113+
const FeatureStatsView& feature_stats_view,
114+
tensorflow::metadata::v0::Feature* feature);
115+
110116
} // namespace data_validation
111117
} // namespace tensorflow
112118
#endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_FEATURE_UTIL_H_

tensorflow_data_validation/anomalies/feature_util_test.cc

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,194 @@ TEST(FeatureUtilTest,
273273
EXPECT_EQ(actual_descriptions.size(), 0);
274274
}
275275

276+
// When the observed unique counts lie within [min, max] for both a categorical
// int feature and a string feature, UpdateUniqueConstraints must leave the
// schema features untouched and report no anomalies.
TEST(FeatureUtilTest, UpdateUniqueConstraintsNoChange) {
  DatasetFeatureStatistics dataset_stats =
      ParseTextProtoOrDie<DatasetFeatureStatistics>(R"(
        features: {
          name: "categorical_feature"
          type: INT
          string_stats {
            common_stats: {
              num_missing: 0
              num_non_missing: 2
              min_num_values: 1
              max_num_values: 1
            }
            unique: 5
          }
        },
        features: {
          name: "string_feature"
          type: STRING
          string_stats {
            common_stats: {
              num_missing: 0
              num_non_missing: 2
              min_num_values: 1
              max_num_values: 1
            }
            unique: 1
          }
        })");
  DatasetStatsView dataset_view(dataset_stats);

  // Categorical feature: 5 unique values, allowed range [1, 5].
  const FeatureStatsView categorical_view =
      dataset_view.GetByPath(Path({"categorical_feature"})).value();
  Feature categorical_before =
      ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
        name: "categorical_feature"
        type: INT
        int_domain { is_categorical: true }
        unique_constraints { min: 1 max: 5 })");
  Feature categorical_after = categorical_before;
  std::vector<Description> categorical_anomalies =
      UpdateUniqueConstraints(categorical_view, &categorical_after);

  // String feature: 1 unique value, allowed range [1, 5].
  const FeatureStatsView string_view =
      dataset_view.GetByPath(Path({"string_feature"})).value();
  Feature string_before =
      ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(
          R"(name: "string_feature"
             type: BYTES
             unique_constraints { min: 1 max: 5 })");
  Feature string_after = string_before;
  std::vector<Description> string_anomalies =
      UpdateUniqueConstraints(string_view, &string_after);

  // Neither feature is modified and neither produces an anomaly.
  EXPECT_THAT(categorical_after, EqualsProto(categorical_before));
  EXPECT_EQ(categorical_anomalies.size(), 0);
  EXPECT_THAT(string_after, EqualsProto(string_before));
  EXPECT_EQ(string_anomalies.size(), 0);
}
340+
341+
// When the observed unique count falls outside [min, max],
// UpdateUniqueConstraints must widen the violated bound to the observed count
// and report exactly one anomaly describing the violation.
TEST(FeatureUtilTest, UpdateUniqueConstraintsNumUniquesOutsideRange) {
  DatasetFeatureStatistics dataset_stats =
      ParseTextProtoOrDie<DatasetFeatureStatistics>(R"(
        features: {
          name: "categorical_feature"
          type: INT
          string_stats {
            common_stats: {
              num_missing: 0
              num_non_missing: 2
              min_num_values: 1
              max_num_values: 1
            }
            unique: 5
          }
        },
        features: {
          name: "string_feature"
          type: STRING
          string_stats {
            common_stats: {
              num_missing: 0
              num_non_missing: 2
              min_num_values: 1
              max_num_values: 1
            }
            unique: 1
          }
        })");
  DatasetStatsView dataset_view(dataset_stats);

  // Categorical feature: 5 unique values exceed max: 2, so max should become 5
  // while min stays as specified.
  const FeatureStatsView categorical_view =
      dataset_view.GetByPath(Path({"categorical_feature"})).value();
  Feature categorical_schema =
      ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
        name: "categorical_feature"
        type: INT
        int_domain { is_categorical: true }
        unique_constraints { min: 2 max: 2 })");
  Feature categorical_want =
      ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
        name: "categorical_feature"
        type: INT
        int_domain { is_categorical: true }
        unique_constraints { min: 2 max: 5 })");
  std::vector<Description> categorical_anomalies =
      UpdateUniqueConstraints(categorical_view, &categorical_schema);

  // String feature: 1 unique value is below min: 2, so min should become 1
  // while max stays as specified.
  const FeatureStatsView string_view =
      dataset_view.GetByPath(Path({"string_feature"})).value();
  Feature string_schema =
      ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(
          R"(name: "string_feature"
             type: BYTES
             unique_constraints { min: 2 max: 2 })");
  Feature string_want =
      ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(
          R"(name: "string_feature"
             type: BYTES
             unique_constraints { min: 1 max: 2 })");
  std::vector<Description> string_anomalies =
      UpdateUniqueConstraints(string_view, &string_schema);

  EXPECT_THAT(categorical_schema, EqualsProto(categorical_want));
  EXPECT_EQ(categorical_anomalies.size(), 1);
  EXPECT_EQ(categorical_anomalies.at(0).long_description,
            "Expected no more than 2 unique values but found 5.");
  EXPECT_THAT(string_schema, EqualsProto(string_want));
  EXPECT_EQ(string_anomalies.size(), 1);
  EXPECT_EQ(string_anomalies.at(0).long_description,
            "Expected at least 2 unique values but found only 1.");
}
422+
423+
// A feature whose statistics carry num_stats (and therefore no unique count)
// cannot be checked against unique_constraints: the constraints must be
// removed from the schema and a single anomaly reported.
TEST(FeatureUtilTest, UpdateUniqueConstraintsNotStringOrCategorical) {
  DatasetFeatureStatistics dataset_stats =
      ParseTextProtoOrDie<DatasetFeatureStatistics>(R"(
        features: {
          name: "numeric_feature"
          type: INT
          num_stats {
            common_stats: {
              num_missing: 0
              num_non_missing: 6
              min_num_values: 1
              max_num_values: 1
            }
          }
        })");
  DatasetStatsView dataset_view(dataset_stats);
  const FeatureStatsView numeric_view =
      dataset_view.GetByPath(Path({"numeric_feature"})).value();

  // Schema before the update, and the same schema with the constraints dropped.
  Feature numeric_schema =
      ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
        name: "numeric_feature"
        type: INT
        unique_constraints { min: 5 max: 5 })");
  Feature numeric_want =
      ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
        name: "numeric_feature"
        type: INT)");

  std::vector<Description> numeric_anomalies =
      UpdateUniqueConstraints(numeric_view, &numeric_schema);

  // unique_constraints is gone and exactly one anomaly explains why.
  EXPECT_THAT(numeric_schema, EqualsProto(numeric_want));
  EXPECT_EQ(numeric_anomalies.size(), 1);
  EXPECT_EQ(numeric_anomalies.at(0).long_description,
            "UniqueConstraints specified for the feature, but unique values "
            "were not counted (i.e., feature is not string or categorical).");
}
463+
276464
// Confirm that the result of calling DeprecateFeature on a feature is
277465
// recognized as deprecated by FeatureIsDeprecated.
278466
TEST(FeatureTypeTest, DeprecateConsistency) {

tensorflow_data_validation/anomalies/schema.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,6 +1059,10 @@ std::vector<Description> Schema::UpdateFeatureInternal(
10591059
DCHECK(false);
10601060
}
10611061

1062+
if (feature->has_unique_constraints()) {
1063+
add_to_descriptions(UpdateUniqueConstraints(view, feature));
1064+
}
1065+
10621066
const std::vector<FeatureComparatorType> all_comparator_types = {
10631067
FeatureComparatorType::DRIFT, FeatureComparatorType::SKEW};
10641068
// Handle comparators here.

0 commit comments

Comments
 (0)