Skip to content

Commit 9dd9718

Browse files
authored
Fix restart HCAD detector bug (opensearch-project#460)
* Fix restart HCAD detector bug. To prevent repeatedly cold starting a model due to sparse data, HCAD has a cache that remembers that a cold start has already been done for a model. A second cold start attempt must wait for 60 detector intervals. Previously, stopping a detector did not clean this cache, so the cache still remembered the model and cold start would not be retried for some time. This PR fixes the bug by cleaning the cache when stopping a detector. Testing done: 1. Added unit and integration tests. 2. Manually reproduced the issue and verified the fix. Signed-off-by: Kaituo Li <[email protected]>
1 parent 3bdb4f6 commit 9dd9718

File tree

5 files changed

+407
-25
lines changed

5 files changed

+407
-25
lines changed

Diff for: src/main/java/org/opensearch/ad/ml/EntityColdStarter.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import org.opensearch.action.ActionListener;
3838
import org.opensearch.action.support.ThreadedActionListener;
3939
import org.opensearch.ad.AnomalyDetectorPlugin;
40+
import org.opensearch.ad.CleanState;
4041
import org.opensearch.ad.MaintenanceState;
4142
import org.opensearch.ad.NodeStateManager;
4243
import org.opensearch.ad.caching.DoorKeeper;
@@ -63,7 +64,7 @@
6364
* Training models for HCAD detectors
6465
*
6566
*/
66-
public class EntityColdStarter implements MaintenanceState {
67+
public class EntityColdStarter implements MaintenanceState, CleanState {
6768
private static final Logger logger = LogManager.getLogger(EntityColdStarter.class);
6869
private final Clock clock;
6970
private final ThreadPool threadPool;
@@ -743,4 +744,9 @@ public void maintenance() {
743744
}
744745
});
745746
}
747+
748+
@Override
749+
public void clear(String detectorId) {
750+
doorKeepers.remove(detectorId);
751+
}
746752
}

Diff for: src/main/java/org/opensearch/ad/transport/DeleteModelTransportAction.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.opensearch.ad.NodeStateManager;
2424
import org.opensearch.ad.caching.CacheProvider;
2525
import org.opensearch.ad.feature.FeatureManager;
26+
import org.opensearch.ad.ml.EntityColdStarter;
2627
import org.opensearch.ad.ml.ModelManager;
2728
import org.opensearch.ad.task.ADTaskCacheManager;
2829
import org.opensearch.cluster.service.ClusterService;
@@ -39,6 +40,7 @@ public class DeleteModelTransportAction extends
3940
private FeatureManager featureManager;
4041
private CacheProvider cache;
4142
private ADTaskCacheManager adTaskCacheManager;
43+
private EntityColdStarter coldStarter;
4244

4345
@Inject
4446
public DeleteModelTransportAction(
@@ -50,7 +52,8 @@ public DeleteModelTransportAction(
5052
ModelManager modelManager,
5153
FeatureManager featureManager,
5254
CacheProvider cache,
53-
ADTaskCacheManager adTaskCacheManager
55+
ADTaskCacheManager adTaskCacheManager,
56+
EntityColdStarter coldStarter
5457
) {
5558
super(
5659
DeleteModelAction.NAME,
@@ -68,6 +71,7 @@ public DeleteModelTransportAction(
6871
this.featureManager = featureManager;
6972
this.cache = cache;
7073
this.adTaskCacheManager = adTaskCacheManager;
74+
this.coldStarter = coldStarter;
7175
}
7276

7377
@Override
@@ -121,6 +125,8 @@ protected DeleteModelNodeResponse nodeOperation(DeleteModelNodeRequest request)
121125

122126
cache.get().clear(adID);
123127

128+
coldStarter.clear(adID);
129+
124130
// delete realtime task cache
125131
adTaskCacheManager.removeRealtimeTaskCache(adID);
126132

0 commit comments

Comments
 (0)