Skip to content

Commit a5dc775

Browse files
[feat] upgrade torchrec to 1.1.0 (#99)
1 parent 3bee923 commit a5dc775

38 files changed

+376
-823
lines changed

.github/workflows/codestyle_ci.yml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-codestyle-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.6
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
1313
steps:
1414
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
1515
uses: actions/checkout@v2

.github/workflows/pytyping_ci.yml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-codestyle-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.6
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
1313
steps:
1414
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
1515
uses: actions/checkout@v2

.github/workflows/unittest_ci.yml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.6
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
1313
options: --gpus all --ipc host
1414
steps:
1515
- name: FetchCommit ${{ github.event.pull_request.head.sha }}

.github/workflows/unittest_cpu_ci.yml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-cpu-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.6-cpu
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7-cpu
1313
options: --ipc host
1414
steps:
1515
- name: FetchCommit ${{ github.event.pull_request.head.sha }}

.pre-commit-config.yaml

+3 -4
Original file line number | Diff line number | Diff line change
@@ -6,11 +6,10 @@ repos:
66
files: \.py$
77
args: ["--license-filepath", "data/.license_header.txt", "--allow-past-years"]
88
- repo: https://github.com/astral-sh/ruff-pre-commit
9-
rev: v0.7.1
9+
rev: v0.8.6
1010
hooks:
1111
- id: ruff
1212
args: [ --fix ]
13-
exclude: tzrec/acc/_decompositions.py|tzrec/acc/_aten_lowering_pass.py
1413
- id: ruff-format
1514
- repo: https://github.com/pre-commit/pre-commit-hooks
1615
rev: v5.0.0
@@ -25,12 +24,12 @@ repos:
2524
- id: mixed-line-ending
2625
args: ["--fix=lf"]
2726
- repo: https://github.com/codespell-project/codespell
28-
rev: v2.3.0
27+
rev: v2.4.1
2928
hooks:
3029
- id: codespell
3130
args: ["--skip", "*.json", "-L", "TBE"]
3231
- repo: https://github.com/executablebooks/mdformat
33-
rev: 0.7.18
32+
rev: 0.7.22
3433
hooks:
3534
- id: mdformat
3635
additional_dependencies:

.pyre_configuration

+3 -3
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,9 @@
44
"tzrec/*/*_test.py",
55
"tzrec/tests/*.py",
66
"tzrec/utils/load_class.py",
7-
"tzrec/acc/_*.py",
87
"tzrec/tools/convert_easyrec_config_to_tzrec_config.py",
9-
"tzrec/*/*_test_tmp.py"
8+
"tzrec/*/*_test_tmp.py",
9+
"tzrec/acc/export_utils.py"
1010
],
1111
"site_package_search_strategy": "all",
1212
"source_directories": [
@@ -16,5 +16,5 @@
1616
}
1717
],
1818
"strict": true,
19-
"version": "0.9.21"
19+
"version": "0.9.23"
2020
}

docker/Dockerfile

+6 -6
Original file line number | Diff line number | Diff line change
@@ -26,14 +26,14 @@ ENV PATH /opt/conda/bin:$PATH
2626

2727
ARG DEVICE
2828
RUN case ${DEVICE} in \
29-
"cu121") pip install torch==2.5.0 fbgemm-gpu==1.0.0 --index-url https://download.pytorch.org/whl/cu121 && \
29+
"cu124") pip install torch==2.6.0 fbgemm-gpu==1.1.0 --index-url https://download.pytorch.org/whl/cu124 && \
3030
pip uninstall -y nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nvjitlink-cu12 nvidia-nvtx-cu12 && \
31-
sed -i '/^Requires-Dist: nvidia-/d' /opt/conda/lib/python3.11/site-packages/torch-2.5.0+cu121.dist-info/METADATA && \
32-
pip install torchmetrics==1.0.3 torch_tensorrt==2.5.0 && \
33-
pip install torchrec==1.0.0 --index-url https://download.pytorch.org/whl/cu121 ;; \
34-
* ) pip install torch==2.5.0 fbgemm-gpu==1.0.0 --index-url https://download.pytorch.org/whl/cpu && \
31+
sed -i '/^Requires-Dist: nvidia-/d' /opt/conda/lib/python3.11/site-packages/torch-2.6.0+cu124.dist-info/METADATA && \
32+
pip install torchmetrics==1.0.3 torch_tensorrt==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 && \
33+
pip install torchrec==1.1.0 --index-url https://download.pytorch.org/whl/cu124 ;; \
34+
* ) pip install torch==2.6.0 fbgemm-gpu==1.1.0 --index-url https://download.pytorch.org/whl/cpu && \
3535
pip install torchmetrics==1.0.3 && \
36-
pip install torchrec==1.0.0 --index-url https://download.pytorch.org/whl/cpu ;; \
36+
pip install torchrec==1.1.0 --index-url https://download.pytorch.org/whl/cpu ;; \
3737
esac && \
3838
/opt/conda/bin/conda clean -ya
3939

docs/source/feature/feature.md

+2 -2
Original file line number | Diff line number | Diff line change
@@ -402,8 +402,8 @@ feature_configs: {
402402

403403
| 方式 | 描述 | 备注 |
404404
| ------------------ | --------------------------------------------- | ------------------------------ |
405-
| query_common_ratio | 计算query与title间重复term数占query中term比例 | 取值为\[0,1\] |
406-
| title_common_ratio | 计算query与title间重复term数占title中term比例 | 取值为\[0,1\] |
405+
| query_common_ratio | 计算query与title间重复term数占query中term比例 | 取值为[0,1] |
406+
| title_common_ratio | 计算query与title间重复term数占title中term比例 | 取值为[0,1] |
407407
| is_contain | 计算query是否全部包含在title中,保持顺序 | 0表示未包含,1表示包含 |
408408
| is_equal | 计算query是否与title完全相同 | 0表示不完全相同,1表示完全相同 |
409409

docs/source/models/deepfm.md

+1 -1
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ model_config {
5555
```
5656

5757
- feature_groups: 需要至少两个feature_group: wide和deep, fm可选
58-
- deepfm: deepfm相关的参数
58+
- deepfm: deepfm相关的参数
5959
- deep: deep mlp的参数配置
6060
- hidden_units: mlp每一层的channel数目,即神经元的数目
6161
- wide_embedding_dim: wide部分输出的大小

docs/source/models/dssm.md

+1 -1
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ model_config {
9797
- input: 输入feature_group名
9898
- hidden_units: mlp每一层的channel数目,即神经元的数目
9999
- output_dim: user/item输出embedding维度
100-
- similarity: 向量相似度函数,包括\[COSINE, INNER_PRODUCT\],默认INNER_PRODUCT
100+
- similarity: 向量相似度函数,包括[COSINE, INNER_PRODUCT],默认INNER_PRODUCT
101101
- dssm_v2: 参数同dssm
102102
- dssm_v2可以支持user与item塔 跨塔share embedding,但训练速度相对dssm_v1稍慢
103103
- 注意如果使用dssm_v2,data_config.force_base_data_group需要设置为true

docs/source/models/multi_tower.md

+1 -1
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ model_config {
5151
}
5252
```
5353

54-
- feature_groups: 可配置多个feature_group,group name可以变
54+
- feature_groups: 可配置多个feature_group,group name可以变
5555
- multi_tower: multi_tower相关的参数
5656
- towers: 每个deep feature_group对应了一个tower。
5757
- input: 跟feature_group的group_name对应

docs/source/quick_start/dlc_odps_dataset_tutorial.md

+1 -1
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ bash upload_data.sh ${ODPS_PROJECT_NAME}
3434

3535
进入[PAI控制台](https://pai.console.aliyun.com),并选择需要使用的工作空间,点击 **模型开发与训练-分布式训练(DLC)**,点击创建任务。
3636

37-
**节点镜像** 选择官方镜像`torcheasyrec:0.6.0-pytorch2.5.0-gpu-py311-cu121-ubuntu22.04`
37+
**节点镜像** 选择官方镜像`torcheasyrec:0.7.0-pytorch2.6.0-gpu-py311-cu124-ubuntu22.04`
3838

3939
**数据集配置** 选择刚新建的NAS数据集
4040

docs/source/quick_start/dlc_tutorial.md

+1 -1
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ pip index versions tzrec -f http://tzrec.oss-cn-beijing.aliyuncs.com/release/nig
3131

3232
进入[PAI控制台](https://pai.console.aliyun.com),并选择需要使用的工作空间,点击 **模型开发与训练-分布式训练(DLC)**,点击创建任务。
3333

34-
**节点镜像** 选择官方镜像`torcheasyrec:0.6.0-pytorch2.5.0-gpu-py311-cu121-ubuntu22.04`
34+
**节点镜像** 选择官方镜像`torcheasyrec:0.7.0-pytorch2.6.0-gpu-py311-cu124-ubuntu22.04`
3535

3636
**数据集配置** 选择刚新建的NAS数据集
3737

docs/source/quick_start/local_tutorial.md

+5 -5
Original file line number | Diff line number | Diff line change
@@ -15,10 +15,10 @@ pip index versions tzrec -f http://tzrec.oss-cn-beijing.aliyuncs.com/release/nig
1515
```bash
1616
conda create -n tzrec python=3.11
1717
conda activate tzrec
18-
pip install torch==2.5.0 --index-url https://download.pytorch.org/whl/cu121
19-
pip install fbgemm-gpu==1.0.0 --index-url https://download.pytorch.org/whl/cu121
18+
pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu124
19+
pip install fbgemm-gpu==1.1.0 --index-url https://download.pytorch.org/whl/cu124
2020
pip install torchmetrics==1.0.3
21-
pip install torchrec==1.0.0 --index-url https://download.pytorch.org/whl/cu121
21+
pip install torchrec==1.1.0 --index-url https://download.pytorch.org/whl/cu124
2222
pip install tzrec==${TZREC_NIGHTLY_VERSION} -f http://tzrec.oss-cn-beijing.aliyuncs.com/release/nightly/repo.html --trusted-host tzrec.oss-cn-beijing.aliyuncs.com
2323
```
2424

@@ -33,8 +33,8 @@ pip install tzrec==${TZREC_NIGHTLY_VERSION} -f http://tzrec.oss-cn-beijing.aliyu
3333
注:
3434

3535
```
36-
GPU版本(CUDA 12.1) 镜像地址:
37-
mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:${TZREC_DOCKER_VERSION}-cu121
36+
GPU版本(CUDA 12.4) 镜像地址:
37+
mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:${TZREC_DOCKER_VERSION}-cu124
3838
CPU版本 镜像地址:
3939
mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:${TZREC_DOCKER_VERSION}-cpu
4040
```

docs/source/quick_start/local_tutorial_u2i_vec.md

+2 -2
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ OMP_NUM_THREADS=16 torchrun --master_addr=localhost --master_port=32555 \
126126
- --hitrate_details_output: (可选)hitrate详情输出表,会包含id | topk_ids | topk_dists
127127
| hitrate | hit_ids 五列
128128
- --batch_size: 评估batch_size,默认为1024
129-
- --index_type: 评估检索方式,默认为IVFFlatIP,可以选 \[IVFFlatIP, IVFFlatL2\]
129+
- --index_type: 评估检索方式,默认为IVFFlatIP,可以选 [IVFFlatIP, IVFFlatL2]
130130
- --top_k: 评估TopK召回的Hitrate,默认200
131131
- --ivf_nlist: IVFFlat索引的聚簇中心个数,默认为1000
132132
- --ivf_nprobe: IVFFlat索引的检索中心个数,默认为800
@@ -151,7 +151,7 @@ python -m tzrec.tools.create_faiss_index \
151151
- --embedding_input_path: 物品池向量表,需包含item_id | item_tower_emb两列
152152
- --index_output_dir: 物品池索引输出目录,一般指定用户塔目录,以保证模型版本和索引版本同时切换
153153
- --batch_size: 索引构建batch_size,默认为1024
154-
- --index_type: 评估检索方式,默认为IVFFlatIP,可以选 \[IVFFlatIP, HNSWFlatIP, IVFFlatL2, HNSWFlatL2\]
154+
- --index_type: 评估检索方式,默认为IVFFlatIP,可以选 [IVFFlatIP, HNSWFlatIP, IVFFlatL2, HNSWFlatL2]
155155
- --ivf_nlist: IVFFlat索引的聚簇中心个数,默认为1000
156156
- --hnsw_M: HNSWFlat索引的M参数
157157
- --hnsw_efConstruction: HNSWFlat索引的efConstruction参数

docs/source/usage/train.md

+1 -1
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ torchrun --master_addr=localhost --master_port=32555 \
1818
- --eval_input_path: 评估数据的输入路径
1919
- --continue_train: 是否增量训练
2020
- --fine_tune_checkpoint: 增量训练的checkpoint路径,如experiments/multi_tower_din_taobao_local/model.ckpt-0,如果不设置,增量训练使用model_dir下最近的检查点
21-
- --edit_config_json: 命令行以json的方式动态修改配置文件,如{"model_dir":"experiments/","feature_configs\[0\].raw_feature.boundaries":\[4,5,6,7\]}
21+
- --edit_config_json: 命令行以json的方式动态修改配置文件,如{"model_dir":"experiments/","feature_configs[0].raw_feature.boundaries":[4,5,6,7]}
2222

2323
### 环境变量
2424

requirements/gpu.txt

+1 -2
Original file line number | Diff line number | Diff line change
@@ -1,2 +1 @@
1-
torch-tensorrt @ http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/trt/torch_tensorrt-2.5.0a0-cp311-cp311-linux_x86_64.whl ; python_version=="3.11"
2-
torch-tensorrt @ http://tzrec.oss-cn-beijing.aliyuncs.com/third_party/trt/torch_tensorrt-2.5.0a0-cp310-cp310-linux_x86_64.whl ; python_version=="3.10"
1+
torch-tensorrt==2.6.0

requirements/runtime.txt

+4 -3
Original file line number | Diff line number | Diff line change
@@ -2,16 +2,17 @@ alibabacloud_credentials
22
anytree
33
common_io @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/common_io-0.4.1%2Btunnel-py2.py3-none-any.whl
44
faiss-cpu
5-
fbgemm-gpu==1.0.0
5+
fbgemm-gpu==1.1.0
66
graphlearn @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/graphlearn-1.3.3-cp311-cp311-linux_x86_64.whl ; python_version=="3.11"
77
graphlearn @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/graphlearn-1.3.3-cp310-cp310-linux_x86_64.whl ; python_version=="3.10"
88
grpcio-tools<1.63.0
9+
numpy<2
910
pandas
1011
pyfg @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-0.4.4-cp311-cp311-linux_x86_64.whl ; python_version=="3.11"
1112
pyfg @ https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/pyfg-0.4.4-cp310-cp310-linux_x86_64.whl ; python_version=="3.10"
1213
pyodps>=0.12.0
1314
scikit-learn
1415
tensorboard
15-
torch==2.5.0
16+
torch==2.6.0
1617
torchmetrics==1.0.3
17-
torchrec==1.0.0
18+
torchrec==1.1.0

requirements/test.txt

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
parameterized
22
pre-commit
3-
pyre-check==0.9.21
3+
pyre-check==0.9.23

scripts/build_docker.sh

+5 -5
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,24 @@
11
#!/usr/bin/env bash
22

33
REGISTRY=mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec
4-
DOCKER_TAG=0.6
4+
DOCKER_TAG=0.7
55

66
cp requirements.txt docker/
77
rm -rf docker/requirements
88
cp -r requirements/ docker/requirements
99
cd docker
1010

11-
for DEVICE in cu121 cpu
11+
for DEVICE in cu124 cpu
1212
do
1313
case ${DEVICE} in
14-
"cu121") BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 ;;
14+
"cu124") BASE_IMAGE=nvidia/cuda:12.4.1-devel-ubuntu22.04 ;;
1515
* ) BASE_IMAGE=ubuntu:22.04 ;;
1616
esac
1717
docker build --network host -t ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-${DEVICE} --build-arg DEVICE=${DEVICE} --build-arg BASE_IMAGE=${BASE_IMAGE} .
1818
docker push ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-${DEVICE}
1919
done
2020

21-
docker images -q ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-cu121 | xargs -I {} docker tag {} ${REGISTRY}/tzrec-devel:${DOCKER_TAG}
22-
docker images -q ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-cu121 | xargs -I {} docker tag {} ${REGISTRY}/tzrec-devel:latest
21+
docker images -q ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-cu124 | xargs -I {} docker tag {} ${REGISTRY}/tzrec-devel:${DOCKER_TAG}
22+
docker images -q ${REGISTRY}/tzrec-devel:${DOCKER_TAG}-cu124 | xargs -I {} docker tag {} ${REGISTRY}/tzrec-devel:latest
2323
docker push ${REGISTRY}/tzrec-devel:${DOCKER_TAG}
2424
docker push ${REGISTRY}/tzrec-devel:latest

scripts/ci_test.sh

-4
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,4 @@
33
pip install -r requirements.txt
44
bash scripts/gen_proto.sh
55

6-
# just workaround for torch-tensorrt (dynamic shape) https://github.com/pytorch/TensorRT/pull/3289/files
7-
cp tzrec/acc/_aten_lowering_pass.py /opt/conda/lib/python3.11/site-packages/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
8-
cp tzrec/acc/_decompositions.py /opt/conda/lib/python3.11/site-packages/torch_tensorrt/dynamo/lowering/_decompositions.py
9-
106
MKL_THREADING_LAYER=GNU TORCH_DEVICE_BACKEND_AUTOLOAD=0 PYTHONPATH=. python tzrec/tests/run.py

scripts/pyre_check.py

+8
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,14 @@
2929
"Undefined attribute [16]: Module `pyarrow` has no attribute",
3030
"Undefined attribute [16]: Module `pyarrow.compute` has no attribute",
3131
"Undefined attribute [16]: Module `pyarrow.csv` has no attribute",
32+
# type-safety of torch.nn.Module instances
33+
# https://github.com/pytorch/pytorch/issues/81462
34+
# Call error [29]: `typing.Union[nn.modules.module.Module, torch._tensor.Tensor]` is
35+
# not a function.
36+
"Union[nn.modules.module.Module, torch._tensor.Tensor]",
37+
"Union[torch._tensor.Tensor, torch.nn.modules.module.Module]",
38+
"Union[torch._tensor.Tensor, nn.modules.module.Module]",
39+
"Union[Module, Tensor]",
3240
]
3341

3442
if __name__ == "__main__":

0 commit comments

Comments
 (0)