86 changes: 86 additions & 0 deletions examples/Advanced/huggingface_tutorial.py
@@ -0,0 +1,86 @@
"""
Hugging Face Integration Tutorial
=================================

This example demonstrates how to use the experimental Hugging Face integration
to push models to the Hugging Face Hub and link them to OpenML runs.

Requirements:
    pip install openml[huggingface]
    or
    pip install huggingface_hub transformers
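
A Hugging Face account and access token (e.g. via `huggingface-cli login`) are
additionally required for the push step.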
"""
import logging
import sys

import openml
from openml.extensions.huggingface_integration import (
    push_model_to_hub_for_run,
    load_model_from_run,
    run_task_with_hf_sync,
    is_hf_transformer,
)

# Configure logging
logging.basicConfig(level=logging.INFO)

def main():
    # Check that the optional Hugging Face dependencies are available
    try:
        import huggingface_hub  # noqa: F401
        from transformers import AutoModel, AutoConfig
    except ImportError:
        print("This example requires 'transformers' and 'huggingface_hub'.")
        print("Please install them with: pip install openml[huggingface]")
        sys.exit(0)

    print("Hugging Face integration is available.")

    # 1. Create a dummy model (or load one)
    # For demonstration, we'll create a tiny random model
    config = AutoConfig.from_pretrained("bert-base-uncased")
    config.num_hidden_layers = 1
    config.hidden_size = 32
    config.num_attention_heads = 2
    config.vocab_size = 100

    model = AutoModel.from_config(config)

    if is_hf_transformer(model):
        print("Model is recognized as a Hugging Face transformer.")

    # 2. Set up a dummy run (in a real scenario, you would run a task)
    # Here we just simulate a run object
    run = openml.runs.OpenMLRun(task_id=1, flow_id=1, dataset_id=1)
    run.run_id = 12345  # Fake run ID

    # 3. Push model to Hub
    # NOTE: You need to be logged in to the Hugging Face Hub or provide a token.
    # You can log in with `huggingface-cli login`.
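    # Alternatively, you can pass an explicit token (token=...) to
    # push_model_to_hub_for_run and load_model_from_run instead of relying on
    # the CLI login. The environment variable name below is only an
    # illustration; use whatever mechanism you normally use to store secrets:
    # import os
    # token = os.environ.get("HF_TOKEN")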

    repo_id = "your-username/openml-test-model"  # CHANGE THIS

    print(f"\nAttempting to push to {repo_id}...")
    print("Note: This will fail if you don't have write access to the repo or aren't logged in.")

    try:
        # token defaults to None, so the locally stored Hugging Face token is used
        run = push_model_to_hub_for_run(model, run, repo_id=repo_id)

        print("\nRun tags after push:")
        print(run.tags)

        # 4. Load model back
        print("\nLoading model back from run...")
        loaded_model = load_model_from_run(run.run_id)
        print(f"Loaded model: {type(loaded_model)}")

    except Exception as e:
        print(f"\nSkipping the actual push/load in this tutorial due to an error (likely auth): {e}")
        print("To run the full example, ensure you are logged in to the HF Hub and set a valid repo_id.")

    # 5. Convenience wrapper usage: runs the task, pushes the model, and publishes the run
    # run = run_task_with_hf_sync(model, task_id=31, repo_id=repo_id)

if __name__ == "__main__":
    main()
169 changes: 169 additions & 0 deletions openml/extensions/huggingface_integration.py
@@ -0,0 +1,169 @@
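"""Experimental Hugging Face Hub integration for OpenML runs.

Utilities to push Hugging Face Transformers models to the Hub, link them to
OpenML runs via run tags, and load the linked models back from a run.
"""
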
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any

import openml

if TYPE_CHECKING:
    from openml.runs import OpenMLRun

logger = logging.getLogger(__name__)

try:
    from huggingface_hub import HfApi
    from transformers import AutoModel, PreTrainedModel

    _HF_AVAILABLE = True
except ImportError:
    _HF_AVAILABLE = False
    PreTrainedModel = object  # type: ignore


def is_hf_transformer(model: Any) -> bool:
    """Check if a model is a Hugging Face Transformers model."""
    if not _HF_AVAILABLE:
        return False
    return isinstance(model, PreTrainedModel)


def push_model_to_hub_for_run(
    model: Any,
    run: OpenMLRun,
    repo_id: str,
    token: str | None = None,
) -> OpenMLRun:
    """
    Push a Hugging Face model to the Hub and link it to an OpenML run.

    If the model is not a Hugging Face model, the run is returned unchanged.
    If the model is a Hugging Face model, it is pushed to the Hub, and a tag
    referencing the commit is added to the run.

    Parameters
    ----------
    model : Any
        The model to push.
    run : OpenMLRun
        The OpenML run to link.
    repo_id : str
        The ID of the repository to push to (e.g. "username/repo_name").
    token : str, optional
        The Hugging Face authentication token.

    Returns
    -------
    OpenMLRun
        The updated OpenML run.
    """
    if not is_hf_transformer(model):
        return run

    if not _HF_AVAILABLE:
        # Should be unreachable if is_hf_transformer works correctly,
        # but kept as a safety net in case the logic changes.
        logger.warning("Hugging Face integration dependencies not found. Skipping push.")
        return run

    # 1. Push to Hub
    model.push_to_hub(repo_id, commit_message=f"OpenML Run {run.run_id}", token=token)

    # 2. Get latest commit
    api = HfApi(token=token)
    commit_sha = api.list_repo_commits(repo_id)[0].commit_id

    # 3. Construct URI
    # Format: hf://{user_or_org}/{repo_name}@{commit_sha}
    hf_uri = f"hf://{repo_id}@{commit_sha}"

    # 4. Store URI in tags (runs fetched from the server may have tags=None)
    if run.tags is None:
        run.tags = []
    run.tags.append(f"hf_uri={hf_uri}")
    run.tags.append("hf-integrated")

    return run


def load_model_from_run(
    run_id: int,
    token: str | None = None,
) -> Any:
    """
    Load a Hugging Face model linked to an OpenML run.

    Parameters
    ----------
    run_id : int
        The ID of the OpenML run.
    token : str, optional
        The Hugging Face authentication token.

    Returns
    -------
    Any
        The loaded Hugging Face model.

    Raises
    ------
    ImportError
        If Hugging Face dependencies are not installed.
    ValueError
        If the run does not have a linked Hugging Face model.
    """
    if not _HF_AVAILABLE:
        raise ImportError(
            "Hugging Face integration requires 'huggingface_hub' and 'transformers'."
        )

    run = openml.runs.get_run(run_id)

    hf_uri = None
    for tag in run.tags or []:
        if tag.startswith("hf_uri="):
            hf_uri = tag.split("=", 1)[1]
            break

    if not hf_uri:
        raise ValueError(
            f"Run {run_id} does not have a linked Hugging Face model (no 'hf_uri' tag)."
        )

    # Parse URI: hf://{repo_id}@{commit_sha}
    # Strip the leading "hf://"
    uri_path = hf_uri[5:]
    if "@" not in uri_path:
        raise ValueError(f"Invalid HF URI format: {hf_uri}")

    repo_id, commit_sha = uri_path.split("@", 1)

    # Load the model at the exact commit recorded for the run
    return AutoModel.from_pretrained(repo_id, revision=commit_sha, token=token)


def run_task_with_hf_sync(
    model: Any,
    task_id: int,
    repo_id: str,
    hf_token: str | None = None,
) -> OpenMLRun:
    """
    Run a task and sync the model to Hugging Face Hub.

    Parameters
    ----------
    model : Any
        The model to run.
    task_id : int
        The ID of the task to run.
    repo_id : str
        The Hugging Face repository ID to push to.
    hf_token : str, optional
        The Hugging Face authentication token.

    Returns
    -------
    OpenMLRun
        The published OpenML run.
    """
    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_model_on_task(model, task)
    run = push_model_to_hub_for_run(model, run, repo_id=repo_id, token=hf_token)
    run.publish()
    return run
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -109,6 +109,11 @@ docs=[
"mike"
]

huggingface=[
"huggingface_hub",
"transformers"
]

[project.urls]
home="https://openml.org/"
documentation = "https://openml.github.io/openml-python/"
116 changes: 116 additions & 0 deletions tests/test_huggingface_integration.py
@@ -0,0 +1,116 @@
import unittest
from unittest.mock import MagicMock, patch

import pytest

# The integration module guards its Hugging Face imports with a try/except, so
# it can always be imported. The tests below patch _HF_AVAILABLE to exercise
# both the "dependencies missing" and "dependencies present" code paths.
from openml.runs import OpenMLRun
import openml.extensions.huggingface_integration as hf_int

class TestHuggingFaceIntegration(unittest.TestCase):

    def setUp(self):
        self.run = OpenMLRun(task_id=1, flow_id=1, dataset_id=1)
        self.run.run_id = 123
        self.run.tags = []

    def test_is_hf_transformer_no_deps(self):
        # Force _HF_AVAILABLE to False
        with patch("openml.extensions.huggingface_integration._HF_AVAILABLE", False):
            self.assertFalse(hf_int.is_hf_transformer(MagicMock()))

    def test_push_model_no_deps(self):
        with patch("openml.extensions.huggingface_integration._HF_AVAILABLE", False):
            model = MagicMock()
            run = hf_int.push_model_to_hub_for_run(model, self.run, "repo")
            self.assertEqual(run.tags, [])

    def test_load_model_no_deps(self):
        with patch("openml.extensions.huggingface_integration._HF_AVAILABLE", False):
            with self.assertRaises(ImportError):
                hf_int.load_model_from_run(123)

    # Exercising the positive is_hf_transformer() path requires the real
    # transformers.PreTrainedModel class; patching that name inside the module
    # would defeat the isinstance() check. That path is therefore covered by
    # TestHuggingFaceIntegrationWithDeps below, which is skipped when the
    # optional dependencies are not installed.

@pytest.mark.skipif(not hf_int._HF_AVAILABLE, reason="Hugging Face dependencies not installed")
class TestHuggingFaceIntegrationWithDeps(unittest.TestCase):

    def setUp(self):
        self.run = OpenMLRun(task_id=1, flow_id=1, dataset_id=1)
        self.run.run_id = 123
        self.run.tags = []

    def test_is_hf_transformer(self):
        from transformers import PreTrainedModel

        # Dummy subclass; PreTrainedModel.__init__ is skipped on purpose since a
        # mocked config is enough for the isinstance() check in the integration.
        class DummyModel(PreTrainedModel):
            def __init__(self):
                self.config = MagicMock()

        model = DummyModel()
        self.assertTrue(hf_int.is_hf_transformer(model))
        self.assertFalse(hf_int.is_hf_transformer("string"))

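    def test_push_non_hf_model_is_noop(self):
        # Extra check, not part of the original test set: a plain object is not
        # a PreTrainedModel, so the run should be returned unchanged.
        run = hf_int.push_model_to_hub_for_run(object(), self.run, "user/repo")
        self.assertEqual(run.tags, [])
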
    @patch("openml.extensions.huggingface_integration.HfApi")
    def test_push_model_to_hub_for_run(self, MockHfApi):
        from transformers import PreTrainedModel

        model = MagicMock(spec=PreTrainedModel)

        # Mock HfApi
        mock_api = MockHfApi.return_value
        mock_commit = MagicMock()
        mock_commit.commit_id = "sha123"
        mock_api.list_repo_commits.return_value = [mock_commit]

        run = hf_int.push_model_to_hub_for_run(model, self.run, "user/repo")

        model.push_to_hub.assert_called_with("user/repo", commit_message="OpenML Run 123", token=None)
        self.assertIn("hf_uri=hf://user/repo@sha123", run.tags)
        self.assertIn("hf-integrated", run.tags)

    @patch("openml.extensions.huggingface_integration.AutoModel")
    @patch("openml.runs.get_run")
    def test_load_model_from_run(self, mock_get_run, MockAutoModel):
        self.run.tags = ["hf_uri=hf://user/repo@sha123"]
        mock_get_run.return_value = self.run

        hf_int.load_model_from_run(123)

        MockAutoModel.from_pretrained.assert_called_with("user/repo", revision="sha123", token=None)

    @patch("openml.runs.get_run")
    def test_load_model_from_run_missing_tag(self, mock_get_run):
        mock_get_run.return_value = self.run
        with self.assertRaises(ValueError):
            hf_int.load_model_from_run(123)

    @patch("openml.runs.get_run")
    def test_load_model_from_run_bad_uri(self, mock_get_run):
        self.run.tags = ["hf_uri=hf://bad_uri"]
        mock_get_run.return_value = self.run
        with self.assertRaises(ValueError):
            hf_int.load_model_from_run(123)