-
Notifications
You must be signed in to change notification settings - Fork 149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] Metrics framework integration with ml-commons #3661
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,6 +40,7 @@ | |
import org.opensearch.ml.common.model.MetricsCorrelationModelConfig; | ||
import org.opensearch.ml.common.model.QuestionAnsweringModelConfig; | ||
import org.opensearch.ml.common.model.TextEmbeddingModelConfig; | ||
import org.opensearch.telemetry.metrics.tags.Tags; | ||
|
||
import lombok.Builder; | ||
import lombok.Getter; | ||
|
@@ -745,4 +746,15 @@ public static MLModel fromStream(StreamInput in) throws IOException { | |
return new MLModel(in); | ||
} | ||
|
||
public Tags getModelTags() { | ||
return Tags | ||
.create() | ||
.addTag("type", algorithm == FunctionName.REMOTE ? "remote" : "local") | ||
.addTag("provider", algorithm == FunctionName.REMOTE ? getRemoteModelType() : algorithm.name()); | ||
} | ||
|
||
private String getRemoteModelType() { | ||
return "remote_sub_tye"; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Typo: "tye" should be "type". |
||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,17 +8,27 @@ | |
import static org.opensearch.ml.plugin.MachineLearningPlugin.GENERAL_THREAD_POOL; | ||
import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_SYNC_UP_JOB_INTERVAL_IN_SECONDS; | ||
|
||
import java.io.IOException; | ||
import java.time.Instant; | ||
import java.time.temporal.ChronoUnit; | ||
import java.util.List; | ||
|
||
import org.opensearch.action.index.IndexRequest; | ||
import org.opensearch.action.support.WriteRequest; | ||
import org.opensearch.cluster.LocalNodeClusterManagerListener; | ||
import org.opensearch.cluster.service.ClusterService; | ||
import org.opensearch.common.lifecycle.LifecycleListener; | ||
import org.opensearch.common.settings.Settings; | ||
import org.opensearch.common.unit.TimeValue; | ||
import org.opensearch.common.xcontent.json.JsonXContent; | ||
import org.opensearch.core.action.ActionListener; | ||
import org.opensearch.jobscheduler.spi.schedule.IntervalSchedule; | ||
import org.opensearch.ml.autoredeploy.MLModelAutoReDeployer; | ||
import org.opensearch.ml.common.CommonValue; | ||
import org.opensearch.ml.engine.encryptor.Encryptor; | ||
import org.opensearch.ml.engine.indices.MLIndicesHandler; | ||
import org.opensearch.ml.jobs.MLJobParameter; | ||
import org.opensearch.ml.jobs.MLJobType; | ||
import org.opensearch.ml.settings.MLFeatureEnabledSetting; | ||
import org.opensearch.remote.metadata.client.SdkClient; | ||
import org.opensearch.threadpool.Scheduler; | ||
|
@@ -95,6 +105,40 @@ public void onClusterManager() { | |
TimeValue.timeValueSeconds(jobInterval), | ||
GENERAL_THREAD_POOL | ||
); | ||
// startStatsCollectorJob(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this commented out intentionally? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a draft PR; it is not complete and is still in development. I raised it just to show the structure of adding metrics to ml-commons. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, OK. |
||
} | ||
|
||
public void startStatsCollectorJob() { | ||
try { | ||
int intervalInMinutes = 5; | ||
Long lockDurationSeconds = 20L; | ||
|
||
MLJobParameter jobParameter = new MLJobParameter( | ||
MLJobType.STATS_COLLECTOR.name(), | ||
new IntervalSchedule(Instant.now(), intervalInMinutes, ChronoUnit.MINUTES), | ||
lockDurationSeconds, | ||
null, | ||
MLJobType.STATS_COLLECTOR | ||
); | ||
|
||
IndexRequest indexRequest = new IndexRequest() | ||
.index(CommonValue.ML_JOBS_INDEX) | ||
.id(MLJobType.STATS_COLLECTOR.name()) | ||
.source(jobParameter.toXContent(JsonXContent.contentBuilder(), null)) | ||
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); | ||
|
||
client | ||
.index( | ||
indexRequest, | ||
ActionListener | ||
.wrap( | ||
r -> log.info("Indexed ml stats collection job successfully"), | ||
e -> log.error("Failed to index stats collection job", e) | ||
) | ||
); | ||
} catch (IOException e) { | ||
log.error("Failed to index stats collection job", e); | ||
} | ||
} | ||
|
||
private void startSyncModelRoutingCron() { | ||
|
This file was deleted.
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We already have multiple system indices. To avoid adding too many system indices, can we reuse the .plugins-ml-task index?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe the job scheduler monitors a particular index for its documents. When a new document is added, it starts a new job using the parameters in that document, as defined in MLJobParameter. If the index contains documents in different formats, I'm not sure how the job scheduler will react. Likewise, how will the existing tasks work if job scheduler documents are present in the same index? I can test this out and get back to you.