Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Commit 8b6a2c4

Browse files
committed
update to use s2s-ft v0.3; add cpu test
1 parent 7ce3ada commit 8b6a2c4

File tree

5 files changed

+149
-15
lines changed

5 files changed

+149
-15
lines changed

examples/text_summarization/abstractive_summarization_unilm_cnndm.ipynb

+114-9
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,23 @@
6060
"import os\n",
6161
"import shutil\n",
6262
"from tempfile import TemporaryDirectory\n",
63-
"import scrapbook as sb\n",
6463
"import pprint\n",
64+
"import scrapbook as sb\n",
65+
"import sys\n",
6566
"import time\n",
67+
"import torch\n",
68+
"\n",
69+
"nlp_path = os.path.abspath(\"../../\")\n",
70+
"if nlp_path not in sys.path:\n",
71+
" sys.path.insert(0, nlp_path)\n",
6672
"\n",
6773
"from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg\n",
6874
"from utils_nlp.models.transformers.abstractive_summarization_seq2seq import S2SAbsSumProcessor, S2SAbstractiveSummarizer\n",
6975
"from utils_nlp.eval import compute_rouge_python\n",
7076
"\n",
77+
"from utils_nlp.models.transformers.datasets import SummarizationDataset\n",
78+
"from utils_nlp.dataset.cnndm import detokenize\n",
79+
"\n",
7180
"start_time = time.time()"
7281
]
7382
},
@@ -82,11 +91,14 @@
8291
"outputs": [],
8392
"source": [
8493
"# model parameters\n",
85-
"MODEL_NAME = \"unilm-large-cased\"\n",
94+
"MODEL_NAME = \"unilm-base-cased\"\n",
8695
"MAX_SEQ_LENGTH = 768\n",
8796
"MAX_SOURCE_SEQ_LENGTH = 640\n",
8897
"MAX_TARGET_SEQ_LENGTH = 128\n",
8998
"\n",
99+
"# use 0 for CPU\n",
100+
"NUM_GPUS = torch.cuda.device_count()\n",
101+
"\n",
90102
"# fine-tuning parameters\n",
91103
"TRAIN_PER_GPU_BATCH_SIZE = 1\n",
92104
"GRADIENT_ACCUMULATION_STEPS = 2\n",
@@ -101,6 +113,9 @@
101113
" WARMUP_STEPS = 5\n",
102114
" MAX_STEPS = 50\n",
103115
" BEAM_SIZE = 3\n",
116+
" if NUM_GPUS == 0:\n",
117+
" TOP_N = 5\n",
118+
" MAX_STEPS = 10\n",
104119
"\n",
105120
"# inference parameters\n",
106121
"TEST_PER_GPU_BATCH_SIZE = 12\n",
@@ -220,6 +235,7 @@
220235
"source": [
221236
"abs_summarizer.fit(\n",
222237
" train_dataset=train_dataset,\n",
238+
" num_gpus=NUM_GPUS,\n",
223239
" per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE,\n",
224240
" gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n",
225241
" learning_rate=LEARNING_RATE,\n",
@@ -240,11 +256,14 @@
240256
{
241257
"cell_type": "code",
242258
"execution_count": null,
243-
"metadata": {},
259+
"metadata": {
260+
"scrolled": true
261+
},
244262
"outputs": [],
245263
"source": [
246-
"res = abs_summarizer.predict(\n",
264+
"predictions = abs_summarizer.predict(\n",
247265
" test_dataset=test_dataset,\n",
266+
" num_gpus=NUM_GPUS,\n",
248267
" per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,\n",
249268
" beam_size=BEAM_SIZE,\n",
250269
" forbid_ignore_word=FORBID_IGNORE_WORD,\n",
@@ -258,21 +277,106 @@
258277
"metadata": {},
259278
"outputs": [],
260279
"source": [
261-
"for r in res[:5]:\n",
280+
"for r in predictions[:TOP_N]:\n",
262281
" print(r)"
263282
]
264283
},
284+
{
285+
"cell_type": "code",
286+
"execution_count": null,
287+
"metadata": {},
288+
"outputs": [],
289+
"source": [
290+
"test_ds.get_source()[0]"
291+
]
292+
},
293+
{
294+
"cell_type": "code",
295+
"execution_count": null,
296+
"metadata": {},
297+
"outputs": [],
298+
"source": [
299+
"test_ds.get_target()[0]"
300+
]
301+
},
302+
{
303+
"cell_type": "code",
304+
"execution_count": null,
305+
"metadata": {},
306+
"outputs": [],
307+
"source": [
308+
"predictions[0]"
309+
]
310+
},
265311
{
266312
"cell_type": "code",
267313
"execution_count": null,
268314
"metadata": {},
269315
"outputs": [],
270316
"source": [
271317
"with open(OUTPUT_FILE, 'w', encoding=\"utf-8\") as f:\n",
272-
" for line in res:\n",
318+
" for line in predictions:\n",
273319
" f.write(line + '\\n')"
274320
]
275321
},
322+
{
323+
"cell_type": "markdown",
324+
"metadata": {},
325+
"source": [
326+
"## Prediction on a single input sample"
327+
]
328+
},
329+
{
330+
"cell_type": "code",
331+
"execution_count": null,
332+
"metadata": {},
333+
"outputs": [],
334+
"source": [
335+
"source = \"\"\"\n",
336+
"But under the new rule, set to be announced in the next 48 hours, Border Patrol agents would immediately return anyone to Mexico — without any detainment and without any due process — who attempts to cross the southwestern border between the legal ports of entry. The person would not be held for any length of time in an American facility.\n",
337+
"\n",
338+
"Although they advised that details could change before the announcement, administration officials said the measure was needed to avert what they fear could be a systemwide outbreak of the coronavirus inside detention facilities along the border. Such an outbreak could spread quickly through the immigrant population and could infect large numbers of Border Patrol agents, leaving the southwestern border defenses weakened, the officials argued.\n",
339+
"The Trump administration plans to immediately turn back all asylum seekers and other foreigners attempting to enter the United States from Mexico illegally, saying the nation cannot risk allowing the coronavirus to spread through detention facilities and Border Patrol agents, four administration officials said.\n",
340+
"The administration officials said the ports of entry would remain open to American citizens, green-card holders and foreigners with proper documentation. Some foreigners would be blocked, including Europeans currently subject to earlier travel restrictions imposed by the administration. The points of entry will also be open to commercial traffic.\"\"\""
341+
]
342+
},
343+
{
344+
"cell_type": "code",
345+
"execution_count": null,
346+
"metadata": {},
347+
"outputs": [],
348+
"source": [
349+
"single_test_ds = SummarizationDataset(\n",
350+
" None, source=[source], source_preprocessing=[detokenize],\n",
351+
")\n",
352+
"single_test_dataset = processor.s2s_dataset_from_sum_ds(single_test_ds, train_mode=False)"
353+
]
354+
},
355+
{
356+
"cell_type": "code",
357+
"execution_count": null,
358+
"metadata": {},
359+
"outputs": [],
360+
"source": [
361+
"single_prediction = abs_summarizer.predict(\n",
362+
" test_dataset=single_test_dataset,\n",
363+
" num_gpus=NUM_GPUS,\n",
364+
" per_gpu_batch_size=1,\n",
365+
" beam_size=BEAM_SIZE,\n",
366+
" forbid_ignore_word=FORBID_IGNORE_WORD,\n",
367+
" fp16=FP16\n",
368+
")"
369+
]
370+
},
371+
{
372+
"cell_type": "code",
373+
"execution_count": null,
374+
"metadata": {},
375+
"outputs": [],
376+
"source": [
377+
"single_prediction[0]"
378+
]
379+
},
276380
{
277381
"cell_type": "markdown",
278382
"metadata": {},
@@ -297,7 +401,7 @@
297401
"metadata": {},
298402
"outputs": [],
299403
"source": [
300-
"rouge_scores = compute_rouge_python(cand=res, ref=test_ds.get_target())\n",
404+
"rouge_scores = compute_rouge_python(cand=predictions, ref=test_ds.get_target())\n",
301405
"pprint.pprint(rouge_scores)"
302406
]
303407
},
@@ -358,7 +462,7 @@
358462
"metadata": {},
359463
"outputs": [],
360464
"source": [
361-
"print(\"Total notebook runningn time {}\".format(time.time() - start_time))"
465+
"print(\"Total notebook running time {}\".format(time.time() - start_time))"
362466
]
363467
},
364468
{
@@ -375,8 +479,9 @@
375479
}
376480
],
377481
"metadata": {
482+
"celltoolbar": "Tags",
378483
"kernelspec": {
379-
"display_name": "nlp_gpu",
484+
"display_name": "Python (nlp_gpu)",
380485
"language": "python",
381486
"name": "nlp_gpu"
382487
},

tests/integration/test_notebooks_unilm_abstractive_summarization.py

+29
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66
import scrapbook as sb
77
from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK
8+
import torch
89

910
ABS_TOL = 0.02
1011

@@ -19,6 +20,7 @@ def test_unilm_abstractive_summarization(notebooks, tmp):
1920
kernel_name=KERNEL_NAME,
2021
parameters=dict(
2122
QUICK_RUN=True,
23+
NUM_GPUS=torch.cuda.device_count(),
2224
TOP_N=100,
2325
WARMUP_STEPS=5,
2426
MAX_STEPS=50,
@@ -33,3 +35,30 @@ def test_unilm_abstractive_summarization(notebooks, tmp):
3335
assert pytest.approx(result["rouge_1_f_score"], 0.2, abs=ABS_TOL)
3436
assert pytest.approx(result["rouge_2_f_score"], 0.07, abs=ABS_TOL)
3537
assert pytest.approx(result["rouge_l_f_score"], 0.16, abs=ABS_TOL)
38+
39+
@pytest.mark.cpu
@pytest.mark.integration
def test_unilm_abstractive_summarization_cpu(notebooks, tmp):
    """Execute the UniLM abstractive summarization notebook on CPU and check ROUGE.

    The notebook is run in quick-run mode with ``NUM_GPUS=0`` and a very small
    test set (``TOP_N=2``), then the scores it recorded are read back from the
    executed notebook and compared against the expected values.

    This test has its own name (``..._cpu`` suffix): a second function named
    ``test_unilm_abstractive_summarization`` in the same module would rebind
    the name and prevent the GPU variant defined above from being collected.

    Args:
        notebooks: mapping from notebook key to notebook path (presumably a
            pytest fixture; it is indexed with
            ``"unilm_abstractive_summarization"``).
        tmp: directory passed to the notebook as both ``MODEL_DIR`` and
            ``RESULT_DIR`` (presumably a temporary-directory fixture).
    """
    notebook_path = notebooks["unilm_abstractive_summarization"]
    pm.execute_notebook(
        notebook_path,
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=dict(
            QUICK_RUN=True,
            NUM_GPUS=0,
            TOP_N=2,
            WARMUP_STEPS=5,
            MAX_STEPS=50,
            GRADIENT_ACCUMULATION_STEPS=1,
            TEST_PER_GPU_BATCH_SIZE=2,
            BEAM_SIZE=3,
            MODEL_DIR=tmp,
            RESULT_DIR=tmp,
        ),
    )
    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict

    # Compare with ``==``: a bare ``assert pytest.approx(...)`` only checks the
    # truthiness of the approx object and therefore can never fail.
    # NOTE(review): the expected values below were never actually enforced
    # before this fix -- confirm them against a real CPU run.
    assert result["rouge_1_f_score"] == pytest.approx(0.1, abs=ABS_TOL)
    assert result["rouge_2_f_score"] == pytest.approx(0.05, abs=ABS_TOL)
    assert result["rouge_l_f_score"] == pytest.approx(0.1, abs=ABS_TOL)
64+

tools/generate_conda_file.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@
9595
"googledrivedownloader": "googledrivedownloader>=0.4",
9696
"methodtools": "methodtools",
9797
"s2s-ft": "-e git+https://github.com/microsoft/unilm.git"
98-
"@s2s-ft.v0.0#egg=s2s-ft&subdirectory=s2s-ft",
98+
"@s2s-ft.v0.3#egg=s2s-ft&subdirectory=s2s-ft",
9999
"requests": "requests==2.22.0",
100100
"requests-oauthlib": "requests-oauthlib==1.2.0",
101101
"regex": "regex==2020.2.20",

utils_nlp/dataset/cnndm.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def download(local_path=".data"):
165165
return output_dir
166166

167167

168-
def _detokenize(line):
168+
def detokenize(line):
169169
"""
170170
Detokenizes the processed CNN/DM dataset to recover the original dataset,
171171
e.g. converts "-LRB-" back to "(" and "-RRB-" back to ")".
@@ -255,8 +255,8 @@ def CNNDMSummarizationDatasetOrg(
255255
dev_source_file = os.path.join(org_data_dir, "dev.article")
256256
dev_target_file = os.path.join(org_data_dir, "dev.summary")
257257

258-
source_preprocessing = [_detokenize]
259-
target_preprocessing = [_detokenize]
258+
source_preprocessing = [detokenize]
259+
target_preprocessing = [detokenize]
260260

261261
if return_iterable:
262262
train_dataset = IterableSummarizationDataset(

utils_nlp/models/transformers/abstractive_summarization_seq2seq.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -732,12 +732,12 @@ def fit(
732732
)
733733

734734
if save_model_to_dir is not None and local_rank in [-1, 0]:
735-
self.save_model(save_model_to_dir, global_step - 1, fp16)
735+
self.save_model(save_model_to_dir, global_step, fp16)
736736

737737
# release GPU memories
738738
self.model.cpu()
739739
torch.cuda.empty_cache()
740-
return global_step - 1
740+
return global_step
741741

742742
def predict(
743743
self,

0 commit comments

Comments
 (0)