update to use s2s-ft v0.3; add cpu test

daden-ms · daden-ms · commit 8b6a2c47c74e · 2020-04-03T18:45:27.000Z
diff --git a/examples/text_summarization/abstractive_summarization_unilm_cnndm.ipynb b/examples/text_summarization/abstractive_summarization_unilm_cnndm.ipynb
@@ -60,14 +60,23 @@
     "import os\n",
     "import shutil\n",
     "from tempfile import TemporaryDirectory\n",
-    "import scrapbook as sb\n",
     "import pprint\n",
+    "import scrapbook as sb\n",
+    "import sys\n",
     "import time\n",
+    "import torch\n",
+    "\n",
+    "nlp_path = os.path.abspath(\"../../\")\n",
+    "if nlp_path not in sys.path:\n",
+    "    sys.path.insert(0, nlp_path)\n",
     "\n",
     "from utils_nlp.dataset.cnndm import CNNDMSummarizationDatasetOrg\n",
     "from utils_nlp.models.transformers.abstractive_summarization_seq2seq import S2SAbsSumProcessor, S2SAbstractiveSummarizer\n",
     "from utils_nlp.eval import compute_rouge_python\n",
     "\n",
+    "from utils_nlp.models.transformers.datasets import SummarizationDataset\n",
+    "from utils_nlp.dataset.cnndm import detokenize\n",
+    "\n",
     "start_time = time.time()"
    ]
   },
@@ -82,11 +91,14 @@
    "outputs": [],
    "source": [
     "# model parameters\n",
-    "MODEL_NAME = \"unilm-large-cased\"\n",
+    "MODEL_NAME = \"unilm-base-cased\"\n",
     "MAX_SEQ_LENGTH = 768\n",
     "MAX_SOURCE_SEQ_LENGTH = 640\n",
     "MAX_TARGET_SEQ_LENGTH = 128\n",
     "\n",
+    "# use 0 for CPU\n",
+    "NUM_GPUS =  torch.cuda.device_count()\n",
+    "\n",
     "# fine-tuning parameters\n",
     "TRAIN_PER_GPU_BATCH_SIZE = 1\n",
     "GRADIENT_ACCUMULATION_STEPS = 2\n",
@@ -101,6 +113,9 @@
     "    WARMUP_STEPS = 5\n",
     "    MAX_STEPS = 50\n",
     "    BEAM_SIZE = 3\n",
+    "    if NUM_GPUS == 0:\n",
+    "        TOP_N = 5\n",
+    "        MAX_STEPS = 10\n",
     "\n",
     "# inference parameters\n",
     "TEST_PER_GPU_BATCH_SIZE = 12\n",
@@ -220,6 +235,7 @@
    "source": [
     "abs_summarizer.fit(\n",
     "    train_dataset=train_dataset,\n",
+    "    num_gpus=NUM_GPUS,\n",
     "    per_gpu_batch_size=TRAIN_PER_GPU_BATCH_SIZE,\n",
     "    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n",
     "    learning_rate=LEARNING_RATE,\n",
@@ -240,11 +256,14 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "res = abs_summarizer.predict(\n",
+    "predictions = abs_summarizer.predict(\n",
     "    test_dataset=test_dataset,\n",
+    "    num_gpus=NUM_GPUS,\n",
     "    per_gpu_batch_size=TEST_PER_GPU_BATCH_SIZE,\n",
     "    beam_size=BEAM_SIZE,\n",
     "    forbid_ignore_word=FORBID_IGNORE_WORD,\n",
@@ -258,21 +277,106 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for r in res[:5]:\n",
+    "for r in predictions[:TOP_N]:\n",
     "    print(r)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_ds.get_source()[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_ds.get_target()[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions[0]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "with open(OUTPUT_FILE, 'w', encoding=\"utf-8\") as f:\n",
-    "    for line in res:\n",
+    "    for line in predictions:\n",
     "        f.write(line + '\\n')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prediction on a single input sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "source = \"\"\"\n",
+    "But under the new rule, set to be announced in the next 48 hours, Border Patrol agents would immediately return anyone to Mexico — without any detainment and without any due process — who attempts to cross the southwestern border between the legal ports of entry. The person would not be held for any length of time in an American facility.\n",
+    "\n",
+    "Although they advised that details could change before the announcement, administration officials said the measure was needed to avert what they fear could be a systemwide outbreak of the coronavirus inside detention facilities along the border. Such an outbreak could spread quickly through the immigrant population and could infect large numbers of Border Patrol agents, leaving the southwestern border defenses weakened, the officials argued.\n",
+    "The Trump administration plans to immediately turn back all asylum seekers and other foreigners attempting to enter the United States from Mexico illegally, saying the nation cannot risk allowing the coronavirus to spread through detention facilities and Border Patrol agents, four administration officials said.\n",
+    "The administration officials said the ports of entry would remain open to American citizens, green-card holders and foreigners with proper documentation. Some foreigners would be blocked, including Europeans currently subject to earlier travel restrictions imposed by the administration. The points of entry will also be open to commercial traffic.\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "single_test_ds = SummarizationDataset(  # one-item dataset built from the `source` string\n",
+    "    None, source=[source], source_preprocessing=[detokenize],\n",
+    ")\n",
+    "single_test_dataset = processor.s2s_dataset_from_sum_ds(single_test_ds, train_mode=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "single_prediction = abs_summarizer.predict(\n",
+    "    test_dataset=single_test_dataset,\n",
+    "    num_gpus=NUM_GPUS,\n",
+    "    per_gpu_batch_size=1,\n",
+    "    beam_size=BEAM_SIZE,\n",
+    "    forbid_ignore_word=FORBID_IGNORE_WORD,\n",
+    "    fp16=FP16\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "single_prediction[0]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -297,7 +401,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "rouge_scores = compute_rouge_python(cand=res, ref=test_ds.get_target())\n",
+    "rouge_scores = compute_rouge_python(cand=predictions, ref=test_ds.get_target())\n",
     "pprint.pprint(rouge_scores)"
    ]
   },
@@ -358,7 +462,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(\"Total notebook runningn time {}\".format(time.time() - start_time))"
+    "print(\"Total notebook running time {}\".format(time.time() - start_time))"
    ]
   },
   {
@@ -375,8 +479,9 @@
   }
  ],
  "metadata": {
+  "celltoolbar": "Tags",
   "kernelspec": {
-   "display_name": "nlp_gpu",
+   "display_name": "Python (nlp_gpu)",
    "language": "python",
    "name": "nlp_gpu"
   },
diff --git a/tests/integration/test_notebooks_unilm_abstractive_summarization.py b/tests/integration/test_notebooks_unilm_abstractive_summarization.py
@@ -5,6 +5,7 @@
 import pytest
 import scrapbook as sb
 from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK
+import torch
 
 ABS_TOL = 0.02
 
@@ -19,6 +20,7 @@ def test_unilm_abstractive_summarization(notebooks, tmp):
         kernel_name=KERNEL_NAME,
         parameters=dict(
             QUICK_RUN=True,
+            NUM_GPUS=torch.cuda.device_count(),
             TOP_N=100,
             WARMUP_STEPS=5,
             MAX_STEPS=50,
@@ -33,3 +35,30 @@ def test_unilm_abstractive_summarization(notebooks, tmp):
     assert pytest.approx(result["rouge_1_f_score"], 0.2, abs=ABS_TOL)
     assert pytest.approx(result["rouge_2_f_score"], 0.07, abs=ABS_TOL)
     assert pytest.approx(result["rouge_l_f_score"], 0.16, abs=ABS_TOL)
+
+@pytest.mark.cpu
+@pytest.mark.integration
+def test_unilm_abstractive_summarization_cpu(notebooks, tmp):
+    """CPU run of the UniLM summarization notebook; checks the recorded ROUGE scraps."""
+    pm.execute_notebook(
+        notebooks["unilm_abstractive_summarization"],
+        OUTPUT_NOTEBOOK,
+        kernel_name=KERNEL_NAME,
+        parameters=dict(
+            QUICK_RUN=True,
+            NUM_GPUS=0,  # 0 selects the CPU path in the notebook
+            TOP_N=2,
+            WARMUP_STEPS=5,
+            MAX_STEPS=50,
+            GRADIENT_ACCUMULATION_STEPS=1,
+            TEST_PER_GPU_BATCH_SIZE=2,
+            BEAM_SIZE=3,
+            MODEL_DIR=tmp,
+            RESULT_DIR=tmp,
+        ),
+    )
+    result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
+    assert result["rouge_1_f_score"] == pytest.approx(0.1, abs=ABS_TOL)
+    assert result["rouge_2_f_score"] == pytest.approx(0.05, abs=ABS_TOL)
+    assert result["rouge_l_f_score"] == pytest.approx(0.1, abs=ABS_TOL)
+
diff --git a/tools/generate_conda_file.py b/tools/generate_conda_file.py
@@ -95,7 +95,7 @@
     "googledrivedownloader": "googledrivedownloader>=0.4",
     "methodtools": "methodtools",
     "s2s-ft": "-e git+https://github.com/microsoft/unilm.git"
-    "@s2s-ft.v0.0#egg=s2s-ft&subdirectory=s2s-ft",
+    "@s2s-ft.v0.3#egg=s2s-ft&subdirectory=s2s-ft",
     "requests": "requests==2.22.0",
     "requests-oauthlib": "requests-oauthlib==1.2.0",
     "regex": "regex==2020.2.20",
diff --git a/utils_nlp/dataset/cnndm.py b/utils_nlp/dataset/cnndm.py
@@ -165,7 +165,7 @@ def download(local_path=".data"):
         return output_dir
 
 
-def _detokenize(line):
+def detokenize(line):
     """
     Detokenizes the processed CNN/DM dataset to recover the original dataset,
     e.g. converts "-LRB-" back to "(" and "-RRB-" back to ")".
@@ -255,8 +255,8 @@ def CNNDMSummarizationDatasetOrg(
     dev_source_file = os.path.join(org_data_dir, "dev.article")
     dev_target_file = os.path.join(org_data_dir, "dev.summary")
 
-    source_preprocessing = [_detokenize]
-    target_preprocessing = [_detokenize]
+    source_preprocessing = [detokenize]
+    target_preprocessing = [detokenize]
 
     if return_iterable:
         train_dataset = IterableSummarizationDataset(
diff --git a/utils_nlp/models/transformers/abstractive_summarization_seq2seq.py b/utils_nlp/models/transformers/abstractive_summarization_seq2seq.py
@@ -732,12 +732,12 @@ def fit(
         )
 
         if save_model_to_dir is not None and local_rank in [-1, 0]:
-            self.save_model(save_model_to_dir, global_step - 1, fp16)
+            self.save_model(save_model_to_dir, global_step, fp16)
 
         # release GPU memories
         self.model.cpu()
         torch.cuda.empty_cache()
-        return global_step - 1
+        return global_step
 
     def predict(
         self,