
Commit d66300b

[Ready for Review] optimize from_pretrained APIs for decoupling of weights and configs (PaddlePaddle#671)
* api * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update for model_utils
1 parent ebc2b1b commit d66300b

5 files changed (+117, -51 lines)
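In short, this commit teaches both `PretrainedModel.from_pretrained` and `PretrainedTokenizer.from_pretrained` to resolve three kinds of identifiers. A usage sketch assembled from the docstring examples added in the diffs below (the `./my_bert/` directory is illustrative, not part of the commit):

    from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer

    # 1. Built-in pretrained model: resolved via the class resource map
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # 2. Community-contributed model: resolved against COMMUNITY_MODEL_PREFIX
    model = BertForSequenceClassification.from_pretrained(
        'yingyibiao/bert-base-uncased-sst-2-finetuned')

    # 3. Local directory containing model_state.pdparams and model_config.json
    model = BertForSequenceClassification.from_pretrained('./my_bert/')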

community/yingyibiao/bert-base-uncased-sst-2-finetuned/README.md

Whitespace-only changes.
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+{
+    "model_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/yingyibiao/bert-base-uncased-sst-2-finetuned/model_config.json",
+    "model_state": "https://paddlenlp.bj.bcebos.com/models/transformers/community/yingyibiao/bert-base-uncased-sst-2-finetuned/model_state.pdparams",
+    "tokenizer_config_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/yingyibiao/bert-base-uncased-sst-2-finetuned/tokenizer_config.json",
+    "vocab_file": "https://paddlenlp.bj.bcebos.com/models/transformers/community/yingyibiao/bert-base-uncased-sst-2-finetuned/vocab.txt"
+}
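This map pairs each resource name with its hosted URL under the community prefix. As a hedged illustration of consuming such a manifest, the hypothetical helper below downloads every listed resource into a cache directory; note that the loaders changed in this commit do not read this JSON, they derive the same URLs from COMMUNITY_MODEL_PREFIX, as the diffs below show:

    import json

    from paddlenlp.utils.downloader import get_path_from_url

    def download_manifest_resources(manifest_path, cache_dir):
        # Hypothetical helper: fetch every resource listed in a community
        # manifest (like the JSON above) into a local cache directory.
        with open(manifest_path) as f:
            manifest = json.load(f)
        return {file_id: get_path_from_url(url, cache_dir)
                for file_id, url in manifest.items()}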

paddlenlp/transformers/model_utils.py

Lines changed: 51 additions & 27 deletions
@@ -23,7 +23,7 @@
 import paddle
 from paddle.nn import Layer
 # TODO(fangzeyang) Temporary fix and replace by paddle framework downloader later
-from paddlenlp.utils.downloader import get_path_from_url
+from paddlenlp.utils.downloader import get_path_from_url, COMMUNITY_MODEL_PREFIX
 from paddlenlp.utils.env import MODEL_HOME
 from paddlenlp.utils.log import logger

@@ -105,7 +105,7 @@ class is a pretrained model class adding layers on top of the base model,
     """
     model_config_file = "model_config.json"
     pretrained_init_configuration = {}
-    # TODO: more flexible resource handle, namedtuple with fileds as:
+    # TODO: more flexible resource handle, namedtuple with fields as:
     # resource_name, saved_file, handle_name_for_load(None for used as __init__
     # arguments), handle_name_for_save
     resource_files_names = {"model_state": "model_state.pdparams"}
@@ -115,7 +115,7 @@ class is a pretrained model class adding layers on top of the base model,
     def _wrap_init(self, original_init, *args, **kwargs):
         """
         It would be hooked after `__init__` to add a dict including arguments of
-        `__init__` as a attribute named `config` of the prtrained model instance.
+        `__init__` as an attribute named `config` of the pretrained model instance.
         """
         init_dict = fn_args_to_dict(original_init, *((self, ) + args), **kwargs)
         self.config = init_dict
@@ -135,6 +135,7 @@ def model_name_list(self):
             list: Contains all supported built-in pretrained model names of the
                 current PretrainedModel class.
         """
+        # TODO: return all model names
         return list(self.pretrained_init_configuration.keys())

     def get_input_embeddings(self):
@@ -150,14 +151,18 @@ def get_output_embeddings(self):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         """
-        Creates an instance of `PretrainedModel` and load pretrained model weights
-        for it according to a specific model name (such as `bert-base-uncased`)
+        Creates an instance of `PretrainedModel`. Model weights are loaded
+        by specifying the name of a built-in pretrained model, or a community-contributed model,
         or a local file directory path.

         Args:
-            pretrained_model_name_or_path (str): Name of pretrained model
-                for built-in pretrained models loading, such as `bert-base-uncased`.
-                Or a local file directory path for local trained models loading.
+            pretrained_model_name_or_path (str): Name of pretrained model or dir path
+                to load from. The string can be:
+
+                - Name of a built-in pretrained model.
+                - Name of a community-contributed pretrained model.
+                - Local directory path which contains the model weights file
+                  ("model_state.pdparams") and model config file ("model_config.json").
             *args (tuple): Position arguments for model `__init__`. If provided,
                 use these as position argument values for model initialization.
             **kwargs (dict): Keyword arguments for model `__init__`. If provided,
@@ -174,38 +179,47 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):

                 from paddlenlp.transformers import BertForSequenceClassification

+                # Name of built-in pretrained model
                 model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+                # Name of community-contributed pretrained model
+                model = BertForSequenceClassification.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
+
+                # Load from local directory path
+                model = BertForSequenceClassification.from_pretrained('./my_bert/')
         """
         pretrained_models = list(cls.pretrained_init_configuration.keys())
         resource_files = {}
         init_configuration = {}

+        # From built-in pretrained models
         if pretrained_model_name_or_path in pretrained_models:
             for file_id, map_list in cls.pretrained_resource_files_map.items():
                 resource_files[file_id] = map_list[pretrained_model_name_or_path]
             init_configuration = copy.deepcopy(
                 cls.pretrained_init_configuration[pretrained_model_name_or_path])
+        # From local dir path
+        elif os.path.isdir(pretrained_model_name_or_path):
+            for file_id, file_name in cls.resource_files_names.items():
+                full_file_name = os.path.join(pretrained_model_name_or_path,
+                                              file_name)
+                resource_files[file_id] = full_file_name
+            resource_files["model_config_file"] = os.path.join(
+                pretrained_model_name_or_path, cls.model_config_file)
         else:
-            if os.path.isdir(pretrained_model_name_or_path):
-                for file_id, file_name in cls.resource_files_names.items():
-                    full_file_name = os.path.join(pretrained_model_name_or_path,
-                                                  file_name)
-                    resource_files[file_id] = full_file_name
-                resource_files["model_config_file"] = os.path.join(
-                    pretrained_model_name_or_path, cls.model_config_file)
-            else:
-                raise ValueError(
-                    "Calling {}.from_pretrained() with a model identifier or the "
-                    "path to a directory instead. The supported model "
-                    "identifiers are as follows: {}, but got: {}".format(
-                        cls.__name__,
-                        cls.pretrained_init_configuration.keys(),
-                        pretrained_model_name_or_path))
+            # Assuming from community-contributed pretrained models
+            for file_id, file_name in cls.resource_files_names.items():
+                full_file_name = os.path.join(COMMUNITY_MODEL_PREFIX,
+                                              pretrained_model_name_or_path,
+                                              file_name)
+                resource_files[file_id] = full_file_name
+            resource_files["model_config_file"] = os.path.join(
+                COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path,
+                cls.model_config_file)

         default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
-
         resolved_resource_files = {}
         for file_id, file_path in resource_files.items():
             path = os.path.join(default_root, file_path.split('/')[-1])
@@ -217,8 +231,18 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
             else:
                 logger.info("Downloading %s and saved to %s" %
                             (file_path, default_root))
-                resolved_resource_files[file_id] = get_path_from_url(
-                    file_path, default_root)
+                try:
+                    resolved_resource_files[file_id] = get_path_from_url(
+                        file_path, default_root)
+                except RuntimeError as err:
+                    logger.error(err)
+                    raise RuntimeError(
+                        f"Can't load weights for '{pretrained_model_name_or_path}'.\n"
+                        f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
+                        "- a correct model-identifier of built-in pretrained models,\n"
+                        "- or a correct model-identifier of community-contributed pretrained models,\n"
+                        "- or the correct path to a directory containing relevant modeling files (model weights and model config).\n"
+                    )

         # Prepare model initialization kwargs
         # Did we saved some inputs and kwargs to reload ?
@@ -292,7 +316,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         model = cls(*derived_args, **derived_kwargs)

         # Maybe need more ways to load resources.
-        weight_path = list(resolved_resource_files.values())[0]
+        weight_path = resolved_resource_files["model_state"]
         assert weight_path.endswith(
             ".pdparams"), "suffix of weight must be .pdparams"
         state_dict = paddle.load(weight_path)
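Taken together, the model-side hunks establish a fixed resolution order: a built-in name hits the class resource map, an existing local directory is used as-is, and any other string is assumed to be a community identifier whose URLs are built from COMMUNITY_MODEL_PREFIX. The last hunk also makes the weight lookup explicit ("model_state") rather than relying on dict ordering. A condensed sketch of that branching, with resolve_resource_files as a hypothetical standalone helper (the class attributes it reads are the real ones shown above):

    import os

    from paddlenlp.utils.downloader import COMMUNITY_MODEL_PREFIX

    def resolve_resource_files(cls, name_or_path):
        # Hypothetical condensation of the branching added to from_pretrained.
        resource_files = {}
        if name_or_path in cls.pretrained_init_configuration:
            # Built-in model: hosted URLs registered on the class.
            for file_id, url_map in cls.pretrained_resource_files_map.items():
                resource_files[file_id] = url_map[name_or_path]
        elif os.path.isdir(name_or_path):
            # Local directory: weights and config expected side by side.
            for file_id, file_name in cls.resource_files_names.items():
                resource_files[file_id] = os.path.join(name_or_path, file_name)
            resource_files["model_config_file"] = os.path.join(
                name_or_path, cls.model_config_file)
        else:
            # Otherwise assume a community identifier and build hosted URLs.
            for file_id, file_name in cls.resource_files_names.items():
                resource_files[file_id] = os.path.join(
                    COMMUNITY_MODEL_PREFIX, name_or_path, file_name)
            resource_files["model_config_file"] = os.path.join(
                COMMUNITY_MODEL_PREFIX, name_or_path, cls.model_config_file)
        return resource_files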

paddlenlp/transformers/tokenizer_utils.py

Lines changed: 58 additions & 24 deletions
@@ -24,8 +24,9 @@
 from shutil import copyfile
 from typing import Iterable, Iterator, Optional, List, Any, Callable, Union

-from paddlenlp.utils.downloader import get_path_from_url
+from paddlenlp.utils.downloader import get_path_from_url, COMMUNITY_MODEL_PREFIX
 from paddlenlp.utils.env import MODEL_HOME
+from paddlenlp.utils.log import logger

 from ..data.vocab import Vocab
 from .utils import InitTrackerMeta, fn_args_to_dict
@@ -408,14 +409,18 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         """
-        Creates an instance of `PretrainedTokenizer` and loads related resources
-        (such as vocabulary file) according to a specific model name
-        (such as `bert-base-uncased`) or a local file directory path.
+        Creates an instance of `PretrainedTokenizer`. Related resources are loaded
+        by specifying the name of a built-in pretrained model, a community-contributed
+        pretrained model, or a local file directory path.

         Args:
-            pretrained_model_name_or_path (str): A name of pretrained model
-                for built-in tokenizers loading, such as `bert-base-uncased`.
-                Or a local file directory path for local tokenizers loading.
+            pretrained_model_name_or_path (str): Name of pretrained model or dir path
+                to load from. The string can be:
+
+                - Name of a built-in pretrained model.
+                - Name of a community-contributed pretrained model.
+                - Local directory path which contains tokenizer-related resources
+                  and the tokenizer config file ("tokenizer_config.json").
             *args (tuple): position arguments for model `__init__`. If provided,
                 use these as position argument values for tokenizer initialization.
             **kwargs (dict): keyword arguments for model `__init__`. If provided,
@@ -430,39 +435,68 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):

                 from paddlenlp.transformers import BertTokenizer

+                # Name of built-in pretrained model
                 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+                # Name of community-contributed pretrained model
+                tokenizer = BertTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
+
+                # Load from local directory path
+                tokenizer = BertTokenizer.from_pretrained('./my_bert/')
         """
         pretrained_models = list(cls.pretrained_init_configuration.keys())
         vocab_files = {}
         init_configuration = {}
+        # From built-in pretrained models
         if pretrained_model_name_or_path in pretrained_models:
             for file_id, map_list in cls.pretrained_resource_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
             init_configuration = copy.deepcopy(
                 cls.pretrained_init_configuration[pretrained_model_name_or_path])
+        # From local dir path
+        elif os.path.isdir(pretrained_model_name_or_path):
+            for file_id, file_name in cls.resource_files_names.items():
+                full_file_name = os.path.join(pretrained_model_name_or_path,
+                                              file_name)
+                vocab_files[file_id] = full_file_name
+            vocab_files["tokenizer_config_file"] = os.path.join(
+                pretrained_model_name_or_path, cls.tokenizer_config_file)
         else:
-            if os.path.isdir(pretrained_model_name_or_path):
-                for file_id, file_name in cls.resource_files_names.items():
-                    full_file_name = os.path.join(pretrained_model_name_or_path,
-                                                  file_name)
-                    vocab_files[file_id] = full_file_name
-                vocab_files["tokenizer_config_file"] = os.path.join(
-                    pretrained_model_name_or_path, cls.tokenizer_config_file)
-            else:
-                raise ValueError(
-                    "Calling {}.from_pretrained() with a model identifier or the "
-                    "path to a directory instead. The supported model "
-                    "identifiers are as follows: {}".format(
-                        cls.__name__, cls.pretrained_init_configuration.keys()))
+            # Assuming from community-contributed pretrained models
+            for file_id, file_name in cls.resource_files_names.items():
+                full_file_name = os.path.join(COMMUNITY_MODEL_PREFIX,
+                                              pretrained_model_name_or_path,
+                                              file_name)
+                vocab_files[file_id] = full_file_name
+            vocab_files["tokenizer_config_file"] = os.path.join(
+                COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path,
+                cls.tokenizer_config_file)

         default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
         resolved_vocab_files = {}
         for file_id, file_path in vocab_files.items():
-            resolved_vocab_files[
-                file_id] = file_path if file_path is None or os.path.isfile(
-                    file_path) else get_path_from_url(file_path, default_root,
-                                                      None)
+            path = os.path.join(default_root, file_path.split('/')[-1])
+            if file_path is None or os.path.isfile(file_path):
+                resolved_vocab_files[file_id] = file_path
+            elif os.path.exists(path):
+                logger.info("Already cached %s" % path)
+                resolved_vocab_files[file_id] = path
+            else:
+                logger.info("Downloading %s and saved to %s" %
+                            (file_path, default_root))
+                try:
+                    resolved_vocab_files[file_id] = get_path_from_url(
+                        file_path, default_root)
+                except RuntimeError as err:
+                    logger.error(err)
+                    raise RuntimeError(
+                        f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n"
+                        f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
+                        "- a correct model-identifier of built-in pretrained models,\n"
+                        "- or a correct model-identifier of community-contributed pretrained models,\n"
+                        "- or the correct path to a directory containing relevant tokenizer files.\n"
+                    )

         # Prepare tokenizer initialization kwargs
         # Did we saved some inputs and kwargs to reload ?
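The tokenizer loop now mirrors the model loader: reuse a cached copy when present, otherwise download, and turn a failed download into a readable error. A minimal sketch of that cache-then-download pattern (fetch_resource is a hypothetical helper; get_path_from_url, MODEL_HOME, and logger are the real utilities imported above):

    import os

    from paddlenlp.utils.downloader import get_path_from_url
    from paddlenlp.utils.env import MODEL_HOME
    from paddlenlp.utils.log import logger

    def fetch_resource(file_path, name_or_path):
        # Hypothetical condensation of the resolution loop above.
        if file_path is None or os.path.isfile(file_path):
            return file_path  # already local (or deliberately absent)
        default_root = os.path.join(MODEL_HOME, name_or_path)
        cached = os.path.join(default_root, file_path.split('/')[-1])
        if os.path.exists(cached):
            logger.info("Already cached %s" % cached)
            return cached  # reuse the previously downloaded copy
        logger.info("Downloading %s and saved to %s" % (file_path, default_root))
        try:
            return get_path_from_url(file_path, default_root)
        except RuntimeError as err:
            logger.error(err)
            raise RuntimeError(
                "Can't load resources for '%s'. Check that it is a built-in "
                "model name, a community model identifier, or a valid local "
                "directory." % name_or_path)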

paddlenlp/utils/downloader.py

Lines changed: 2 additions & 0 deletions
@@ -60,6 +60,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):

 __all__ = ['get_weights_path_from_url']

+COMMUNITY_MODEL_PREFIX = "https://paddlenlp.bj.bcebos.com/models/transformers/community/"
+
 WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights")

 DOWNLOAD_RETRY_LIMIT = 3
