diff --git a/pytorch_tokenizers/tools/__init__.py b/pytorch_tokenizers/tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pytorch_tokenizers/tools/llama2c/convert.py b/pytorch_tokenizers/tools/llama2c/convert.py
index 7b538ae..963624f 100644
--- a/pytorch_tokenizers/tools/llama2c/convert.py
+++ b/pytorch_tokenizers/tools/llama2c/convert.py
@@ -11,7 +11,7 @@
 
 import argparse
 
-from pytorch_tokenizers import Llama2cTokenizer
+from pytorch_tokenizers.llama2c import Llama2cTokenizer
 
 
 if __name__ == "__main__":
diff --git a/pytorch_tokenizers/tools/llama2c/targets.bzl b/pytorch_tokenizers/tools/llama2c/targets.bzl
index f8b5332..b5f61e6 100644
--- a/pytorch_tokenizers/tools/llama2c/targets.bzl
+++ b/pytorch_tokenizers/tools/llama2c/targets.bzl
@@ -24,6 +24,9 @@ def define_common_targets():
         external_deps = [
             "sentencepiece-py",
         ],
+        deps = [
+            "//pytorch/tokenizers/pytorch_tokenizers:tokenizers",
+        ],
     )
 
     runtime.python_binary(
diff --git a/setup.py b/setup.py
index 99bfb5a..cfa4b2d 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,5 @@
     version="0.1.0",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    packages=find_packages(where="pytorch_tokenizers"),
-    package_dir={"": "pytorch_tokenizers"},
+    packages=find_packages(),
 )
diff --git a/tools/llama2c/convert.py b/tools/llama2c/convert.py
deleted file mode 100644
index 4627012..0000000
--- a/tools/llama2c/convert.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-# @lint-ignore-every LICENSELINT
-
-# Script to rewrite tokenizer model given by sentencepiece to llama2.c format, with lightweight
-# postprocessing logic. The output can be consumed by llama2c_tokenizer.cpp.
-
-import argparse
-
-from pytorch_tokenizers.llama2c import Llama2cTokenizer
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-t",
-        "--tokenizer-model",
-        type=str,
-        default="tokenizer.model",
-        help="path to tokenizer model, given by sentencepiece",
-    )
-    parser.add_argument(
-        "-o",
-        "--output-path",
-        type=str,
-        default=None,
-        help="output path of postprocessed tokenizer model",
-    )
-    parser.add_argument(
-        "-p",
-        "--prepend-padding",
-        action="store_true",
-        help="whether to prepend a padding token to the beginning of the tokenizer",
-    )
-
-    args = parser.parse_args()
-
-    t = Llama2cTokenizer(args.tokenizer_model)
-
-    output_path = (
-        args.output_path
-        if args.output_path
-        else args.tokenizer_model.replace(".model", ".bin")
-    )
-    t.export(output_path, prepend_padding=args.prepend_padding)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/llama2c/targets.bzl b/tools/llama2c/targets.bzl
deleted file mode 100644
index 6ef87d1..0000000
--- a/tools/llama2c/targets.bzl
+++ /dev/null
@@ -1,32 +0,0 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
-
-def define_common_targets():
-    """Defines targets that should be shared between fbcode and xplat.
-
-    The directory containing this targets.bzl file should also contain both
-    TARGETS and BUCK files that call this function.
-    """
-
-    runtime.python_library(
-        name = "lib",
-        srcs = [
-            "__init__.py",
-            "convert.py",
-        ],
-        deps = [
-            "//pytorch/tokenizers/pytorch_tokenizers:tokenizers",
-        ],
-    )
-
-    runtime.python_binary(
-        name = "convert",
-        main_module = "pytorch.tokenizers.tools.llama2c.convert",
-        visibility = [
-            "//executorch/examples/...",
-            "fbsource//xplat/executorch/examples/...",
-        ],
-        _is_external_target = True,
-        deps = [
-            ":lib",
-        ],
-    )