@@ -273,6 +273,194 @@ TEST(FeatureUtilTest,
273
273
EXPECT_EQ (actual_descriptions.size (), 0 );
274
274
}
275
275
276
+ TEST (FeatureUtilTest, UpdateUniqueConstraintsNoChange) {
277
+ DatasetFeatureStatistics statistics =
278
+ ParseTextProtoOrDie<DatasetFeatureStatistics>(R"(
279
+ features: {
280
+ name: "categorical_feature"
281
+ type: INT
282
+ string_stats {
283
+ common_stats: {
284
+ num_missing: 0
285
+ num_non_missing: 2
286
+ min_num_values: 1
287
+ max_num_values: 1
288
+ }
289
+ unique: 5
290
+ }
291
+ },
292
+ features: {
293
+ name: "string_feature"
294
+ type: STRING
295
+ string_stats {
296
+ common_stats: {
297
+ num_missing: 0
298
+ num_non_missing: 2
299
+ min_num_values: 1
300
+ max_num_values: 1
301
+ }
302
+ unique: 1
303
+ }
304
+ })" );
305
+
306
+ DatasetStatsView stats_view (statistics);
307
+ const FeatureStatsView categorical_feature_stats_view =
308
+ stats_view.GetByPath (Path ({" categorical_feature" })).value ();
309
+ const FeatureStatsView string_feature_stats_view =
310
+ stats_view.GetByPath (Path ({" string_feature" })).value ();
311
+
312
+ Feature original_categorical_feature =
313
+ ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
314
+ name: "categorical_feature"
315
+ type: INT
316
+ int_domain { is_categorical: true }
317
+ unique_constraints { min: 1 max: 5 })" );
318
+ Feature categorical_feature;
319
+ categorical_feature.CopyFrom (original_categorical_feature);
320
+ Feature original_string_feature =
321
+ ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(
322
+ R"( name: "string_feature"
323
+ type: BYTES
324
+ unique_constraints { min: 1 max: 5 })" );
325
+ Feature string_feature;
326
+ string_feature.CopyFrom (original_string_feature);
327
+
328
+ std::vector<Description> actual_categorical_descriptions =
329
+ UpdateUniqueConstraints (categorical_feature_stats_view,
330
+ &categorical_feature);
331
+ std::vector<Description> actual_string_descriptions =
332
+ UpdateUniqueConstraints (string_feature_stats_view, &string_feature);
333
+
334
+ // The feature is not changed, and no anomalies are generated.
335
+ EXPECT_THAT (categorical_feature, EqualsProto (original_categorical_feature));
336
+ EXPECT_EQ (actual_categorical_descriptions.size (), 0 );
337
+ EXPECT_THAT (string_feature, EqualsProto (original_string_feature));
338
+ EXPECT_EQ (actual_string_descriptions.size (), 0 );
339
+ }
340
+
341
+ TEST (FeatureUtilTest, UpdateUniqueConstraintsNumUniquesOutsideRange) {
342
+ DatasetFeatureStatistics statistics =
343
+ ParseTextProtoOrDie<DatasetFeatureStatistics>(R"(
344
+ features: {
345
+ name: "categorical_feature"
346
+ type: INT
347
+ string_stats {
348
+ common_stats: {
349
+ num_missing: 0
350
+ num_non_missing: 2
351
+ min_num_values: 1
352
+ max_num_values: 1
353
+ }
354
+ unique: 5
355
+ }
356
+ },
357
+ features: {
358
+ name: "string_feature"
359
+ type: STRING
360
+ string_stats {
361
+ common_stats: {
362
+ num_missing: 0
363
+ num_non_missing: 2
364
+ min_num_values: 1
365
+ max_num_values: 1
366
+ }
367
+ unique: 1
368
+ }
369
+ })" );
370
+
371
+ DatasetStatsView stats_view (statistics);
372
+ const FeatureStatsView categorical_feature_stats_view =
373
+ stats_view.GetByPath (Path ({" categorical_feature" })).value ();
374
+ const FeatureStatsView string_feature_stats_view =
375
+ stats_view.GetByPath (Path ({" string_feature" })).value ();
376
+
377
+ Feature categorical_feature =
378
+ ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
379
+ name: "categorical_feature"
380
+ type: INT
381
+ int_domain { is_categorical: true }
382
+ unique_constraints { min: 2 max: 2 })" );
383
+ Feature string_feature =
384
+ ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(
385
+ R"( name: "string_feature"
386
+ type: BYTES
387
+ unique_constraints { min: 2 max: 2 })" );
388
+
389
+ // The number of unique values for the categorical feature is higher than the
390
+ // original unique_constraints.max for that feature, so expect that the max
391
+ // will be updated.
392
+ Feature expected_categorical_feature =
393
+ ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
394
+ name: "categorical_feature"
395
+ type: INT
396
+ int_domain { is_categorical: true }
397
+ unique_constraints { min: 2 max: 5 })" );
398
+ // The number of unique values for the string feature is lower than the
399
+ // original unique_constraints.min for that feature, so expect that the
400
+ // min will be updated.
401
+ Feature expected_string_feature =
402
+ ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(
403
+ R"( name: "string_feature"
404
+ type: BYTES
405
+ unique_constraints { min: 1 max: 2 })" );
406
+
407
+ std::vector<Description> actual_categorical_descriptions =
408
+ UpdateUniqueConstraints (categorical_feature_stats_view,
409
+ &categorical_feature);
410
+ std::vector<Description> actual_string_descriptions =
411
+ UpdateUniqueConstraints (string_feature_stats_view, &string_feature);
412
+
413
+ EXPECT_THAT (categorical_feature, EqualsProto (expected_categorical_feature));
414
+ EXPECT_EQ (actual_categorical_descriptions.size (), 1 );
415
+ EXPECT_EQ (actual_categorical_descriptions.at (0 ).long_description ,
416
+ " Expected no more than 2 unique values but found 5." );
417
+ EXPECT_THAT (string_feature, EqualsProto (expected_string_feature));
418
+ EXPECT_EQ (actual_string_descriptions.size (), 1 );
419
+ EXPECT_EQ (actual_string_descriptions.at (0 ).long_description ,
420
+ " Expected at least 2 unique values but found only 1." );
421
+ }
422
+
423
+ TEST (FeatureUtilTest, UpdateUniqueConstraintsNotStringOrCategorical) {
424
+ DatasetFeatureStatistics statistics =
425
+ ParseTextProtoOrDie<DatasetFeatureStatistics>(R"(
426
+ features: {
427
+ name: "numeric_feature"
428
+ type: INT
429
+ num_stats {
430
+ common_stats: {
431
+ num_missing: 0
432
+ num_non_missing: 6
433
+ min_num_values: 1
434
+ max_num_values: 1
435
+ }
436
+ }
437
+ })" );
438
+
439
+ DatasetStatsView stats_view (statistics);
440
+ const FeatureStatsView numeric_feature_stats_view =
441
+ stats_view.GetByPath (Path ({" numeric_feature" })).value ();
442
+
443
+ Feature numeric_feature =
444
+ ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
445
+ name: "numeric_feature"
446
+ type: INT
447
+ unique_constraints { min: 5 max: 5 })" );
448
+ Feature expected_numeric_feature =
449
+ ParseTextProtoOrDie<tensorflow::metadata::v0::Feature>(R"(
450
+ name: "numeric_feature"
451
+ type: INT)" );
452
+
453
+ std::vector<Description> actual_numeric_descriptions =
454
+ UpdateUniqueConstraints (numeric_feature_stats_view, &numeric_feature);
455
+
456
+ // The unique_constraints are cleared, and an anomaly is generated.
457
+ EXPECT_THAT (numeric_feature, EqualsProto (expected_numeric_feature));
458
+ EXPECT_EQ (actual_numeric_descriptions.size (), 1 );
459
+ EXPECT_EQ (actual_numeric_descriptions.at (0 ).long_description ,
460
+ " UniqueConstraints specified for the feature, but unique values "
461
+ " were not counted (i.e., feature is not string or categorical)." );
462
+ }
463
+
276
464
// Confirm that the result of calling DeprecateFeature on a feature is
277
465
// recognized as deprecated by FeatureIsDeprecated.
278
466
TEST (FeatureTypeTest, DeprecateConsistency) {
0 commit comments