fix: explore_eval should not learn if the logged action is not among the predicted actions (#4262)

olgavrou · web-flow · commit 0406c0f9310d · 2022-11-08T10:47:21.000-05:00
diff --git a/vowpalwabbit/core/src/reductions/explore_eval.cc b/vowpalwabbit/core/src/reductions/explore_eval.cc
@@ -144,6 +144,7 @@ void do_actual_learning(explore_eval& data, multi_learner& base, VW::multi_ex& e
     data.action_label = std::move(label_example->l.cb);
     label_example->l.cb = std::move(data.empty_label);
   }
+
   multiline_learn_or_predict<false>(base, ec_seq, data.offset);
 
   if (label_example != nullptr)  // restore label
@@ -159,11 +160,18 @@ void do_actual_learning(explore_eval& data, multi_learner& base, VW::multi_ex& e
     VW::action_scores& a_s = ec_seq[0]->pred.a_s;
 
     float action_probability = 0;
+    bool action_found = false;
     for (size_t i = 0; i < a_s.size(); i++)
     {
-      if (data.known_cost.action == a_s[i].action) { action_probability = a_s[i].score; }
+      if (data.known_cost.action == a_s[i].action)
+      {
+        action_probability = a_s[i].score;
+        action_found = true;
+      }
     }
 
+    if (!action_found) { return; }
+
     float threshold = action_probability / data.known_cost.probability;
 
     if (!data.fixed_multiplier) { data.multiplier = std::min(data.multiplier, 1 / threshold); }
@@ -183,15 +191,18 @@ void do_actual_learning(explore_eval& data, multi_learner& base, VW::multi_ex& e
         { ec_found = ec; }
         if (threshold > 1) { ec->weight *= threshold; }
       }
+
       ec_found->l.cb.costs[0].probability = action_probability;
 
       multiline_learn_or_predict<true>(base, ec_seq, data.offset);
 
+      // restore logged example
       if (threshold > 1)
       {
         float inv_threshold = 1.f / threshold;
         for (auto& ec : ec_seq) { ec->weight *= inv_threshold; }
       }
+
       ec_found->l.cb.costs[0].probability = data.known_cost.probability;
       data.update_count++;
     }

Original file line number	Diff line number	Diff line change
`@@ -144,6 +144,7 @@ void do_actual_learning(explore_eval& data, multi_learner& base, VW::multi_ex& e`
`144`	`144`	`data.action_label = std::move(label_example->l.cb);`
`145`	`145`	`label_example->l.cb = std::move(data.empty_label);`
`146`	`146`	`}`
	`147`	`+`
`147`	`148`	`multiline_learn_or_predict<false>(base, ec_seq, data.offset);`
`148`	`149`
`149`	`150`	`if (label_example != nullptr) // restore label`
`@@ -159,11 +160,18 @@ void do_actual_learning(explore_eval& data, multi_learner& base, VW::multi_ex& e`
`159`	`160`	`VW::action_scores& a_s = ec_seq[0]->pred.a_s;`
`160`	`161`
`161`	`162`	`float action_probability = 0;`
	`163`	`+ bool action_found = false;`
`162`	`164`	`for (size_t i = 0; i < a_s.size(); i++)`
`163`	`165`	`{`
`164`		`- if (data.known_cost.action == a_s[i].action) { action_probability = a_s[i].score; }`
	`166`	`+ if (data.known_cost.action == a_s[i].action)`
	`167`	`+ {`
	`168`	`+ action_probability = a_s[i].score;`
	`169`	`+ action_found = true;`
	`170`	`+ }`
`165`	`171`	`}`
`166`	`172`
	`173`	`+ if (!action_found) { return; }`
	`174`	`+`
`167`	`175`	`float threshold = action_probability / data.known_cost.probability;`
`168`	`176`
`169`	`177`	`if (!data.fixed_multiplier) { data.multiplier = std::min(data.multiplier, 1 / threshold); }`
`@@ -183,15 +191,18 @@ void do_actual_learning(explore_eval& data, multi_learner& base, VW::multi_ex& e`
`183`	`191`	`{ ec_found = ec; }`
`184`	`192`	`if (threshold > 1) { ec->weight *= threshold; }`
`185`	`193`	`}`
	`194`	`+`
`186`	`195`	`ec_found->l.cb.costs[0].probability = action_probability;`
`187`	`196`
`188`	`197`	`multiline_learn_or_predict<true>(base, ec_seq, data.offset);`
`189`	`198`
	`199`	`+ // restore logged example`
`190`	`200`	`if (threshold > 1)`
`191`	`201`	`{`
`192`	`202`	`float inv_threshold = 1.f / threshold;`
`193`	`203`	`for (auto& ec : ec_seq) { ec->weight *= inv_threshold; }`
`194`	`204`	`}`
	`205`	`+`
`195`	`206`	`ec_found->l.cb.costs[0].probability = data.known_cost.probability;`
`196`	`207`	`data.update_count++;`
`197`	`208`	`}`