Commit c4a3b3c

highlight anomalies
1 parent 77426a9 commit c4a3b3c

5 files changed: +346 -126 lines changed

notebooks/highlight.ipynb  +189
Large diffs are not rendered by default.

python_autocomplete/dataset/bpe.py  +3 -89

@@ -1,10 +1,11 @@
 from functools import lru_cache
 from heapq import heappush, heappop
-from typing import List, Tuple
+from typing import List
 
 from labml import lab, monit
 from labml.utils.cache import cache_set
-from python_autocomplete.dataset import Tokenizer, ID_CHARS
+from python_autocomplete.dataset import Tokenizer
+from python_autocomplete.dataset.break_words import SourceCodeTokenizer
 
 
 class BPE(Tokenizer):
@@ -142,93 +143,6 @@ def encode(self, word: str):
         return self.encoder.encode([self.char_stoi[c] for c in word if c in self.char_stoi])
 
 
-class WordTokenizer:
-    def collect_words(self, data: str):
-        raise NotImplementedError
-
-    def get_words(self) -> Tuple[List[str], List[int]]:
-        raise NotImplementedError
-
-    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
-        raise NotImplementedError
-
-
-class SourceCodeTokenizer(WordTokenizer):
-    def __init__(self):
-        self.words = {}
-
-    def add_word(self, word):
-        if not word:
-            return
-
-        if word not in self.words:
-            self.words[word] = 1
-        else:
-            self.words[word] += 1
-
-    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
-        last_idx = 0
-        is_id = False
-        res = []
-
-        for i, c in monit.enum('Collect words', data, is_silent=is_silent):
-            if c in ID_CHARS:
-                if not is_id:
-                    if last_idx < i:
-                        res.append(data[last_idx:i])
-                    last_idx = i
-                    is_id = True
-            else:
-                if is_id:
-                    if last_idx < i:
-                        res.append(data[last_idx:i])
-                    last_idx = i
-                    is_id = False
-
-        if last_idx < len(data):
-            res.append(data[last_idx:])
-
-        return res
-
-    def collect_words(self, data: str):
-        last_idx = 0
-        is_id = False
-
-        for i, c in monit.enum('Collect words', data):
-            if c in ID_CHARS:
-                if not is_id:
-                    self.add_word(data[last_idx:i])
-                    last_idx = i
-                    is_id = True
-            else:
-                if is_id:
-                    self.add_word(data[last_idx:i])
-                    last_idx = i
-                    is_id = False
-
-        self.add_word(data[last_idx:])
-
-    def get_words(self):
-        words_list = [(f, w) for w, f in self.words.items()]
-        words_list.sort(key=lambda x: -x[0])
-
-        return [w for _, w in words_list], [f for f, _ in words_list]
-
-
-class NoTokenizer(WordTokenizer):
-    def __init__(self):
-        self.data = ''
-
-    def collect_words(self, data):
-        self.data += data
-
-    def get_words(self):
-        return [self.data], [1]
-
-    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
-        return [data]
-
-
 class BPELearner:
     def __init__(self, words_list: List[str], word_freq: List[int]):
         self.words_list = words_list

python_autocomplete/dataset/break_words.py  +91 (new file)

@@ -0,0 +1,91 @@
+from typing import List, Tuple
+
+from labml import monit
+from python_autocomplete.dataset import ID_CHARS
+
+
+class WordTokenizer:
+    def collect_words(self, data: str):
+        raise NotImplementedError
+
+    def get_words(self) -> Tuple[List[str], List[int]]:
+        raise NotImplementedError
+
+    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
+        raise NotImplementedError
+
+
+class SourceCodeTokenizer(WordTokenizer):
+    def __init__(self):
+        self.words = {}
+
+    def add_word(self, word):
+        if not word:
+            return
+
+        if word not in self.words:
+            self.words[word] = 1
+        else:
+            self.words[word] += 1
+
+    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
+        last_idx = 0
+        is_id = False
+        res = []
+
+        for i, c in monit.enum('Collect words', data, is_silent=is_silent):
+            if c in ID_CHARS:
+                if not is_id:
+                    if last_idx < i:
+                        res.append(data[last_idx:i])
+                    last_idx = i
+                    is_id = True
+            else:
+                if is_id:
+                    if last_idx < i:
+                        res.append(data[last_idx:i])
+                    last_idx = i
+                    is_id = False
+
+        if last_idx < len(data):
+            res.append(data[last_idx:])
+
+        return res
+
+    def collect_words(self, data: str):
+        last_idx = 0
+        is_id = False
+
+        for i, c in monit.enum('Collect words', data):
+            if c in ID_CHARS:
+                if not is_id:
+                    self.add_word(data[last_idx:i])
+                    last_idx = i
+                    is_id = True
+            else:
+                if is_id:
+                    self.add_word(data[last_idx:i])
+                    last_idx = i
+                    is_id = False
+
+        self.add_word(data[last_idx:])
+
+    def get_words(self):
+        words_list = [(f, w) for w, f in self.words.items()]
+        words_list.sort(key=lambda x: -x[0])
+
+        return [w for _, w in words_list], [f for f, _ in words_list]
+
+
+class NoTokenizer(WordTokenizer):
+    def __init__(self):
+        self.data = ''
+
+    def collect_words(self, data):
+        self.data += data
+
+    def get_words(self):
+        return [self.data], [1]
+
+    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
+        return [data]
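
The module above is the WordTokenizer / SourceCodeTokenizer / NoTokenizer code moved out of bpe.py (see the import change in that file). A minimal usage sketch, not part of the commit and assuming only what the diff shows: collect_words() counts runs of identifier and non-identifier characters, and get_words() returns them sorted by descending frequency, which is the (words_list, word_freq) pair BPELearner expects.

    from python_autocomplete.dataset.break_words import SourceCodeTokenizer

    tokenizer = SourceCodeTokenizer()
    tokenizer.collect_words("def add(a, b):\n    return a + b\n")
    words, freqs = tokenizer.get_words()  # words sorted by descending count
    print(words[:5], freqs[:5])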

python_autocomplete/evaluate/anomalies.py  +55 -35

@@ -1,59 +1,79 @@
+import torch
+from torch import nn
+
 from labml import logger, lab, monit
 from labml.logger import Text, Style
-from python_autocomplete.evaluate import Predictor
-from python_autocomplete.evaluate.factory import get_predictor
+from labml_helpers.module import Module
+from python_autocomplete.dataset import Tokenizer
+from python_autocomplete.evaluate.factory import load_experiment
+from python_autocomplete.train import StateUpdater
+
 
+def anomalies(tokenizer: Tokenizer, text: str, model: Module, state_updater: StateUpdater, is_token_by_token: bool):
+    tokens = tokenizer.encode(text)
 
-def anomalies(predictor: Predictor, text: str):
     line_no = 1
-    logs = [(f"{line_no: 4d}: ", Text.meta), (text[0], Text.subtle)]
+    logs = [(f"{line_no: 4d}: ", Text.meta), (tokenizer.itos[tokens[0]], Style.bold)]
+
+    text = torch.tensor(tokens, dtype=torch.long, device=model.device)
+    prompt = text[:1].unsqueeze(-1)
+
+    state = None
+    softmax = nn.Softmax(-1)
 
-    i = 0
+    i = 1
 
     while i + 1 < len(text):
-        # print(i, self.predictor.prompt)
-        preds, _ = predictor.get_predictions(text[:i + 1], None, calc_probs=True)
-        preds = preds[0, :]
-        c = text[i + 1]
-
-        if c == '\n':
-            logger.log(logs)
-            line_no += 1
-            logs = [(f"{line_no: 4d}: ", Text.meta)]
-        elif c == '\r':
-            continue
-        elif c not in predictor.tokenizer.stoi:
-            logs.append(c)
+        with torch.no_grad():
+            prediction, new_state = model(prompt, state)
+
+        state = state_updater(state, new_state)
+        prediction = softmax(prediction[-1, 0])
+
+        if is_token_by_token:
+            prompt = text[i: i + 1].unsqueeze(-1)
         else:
-            next_id = predictor.tokenizer.stoi[c]
-            prob = preds[next_id]
-            if prob > 0.9:
-                logs.append((c, [Style.bold, Text.success, Style.underline]))
-            elif prob > 0.75:
-                logs.append((c, [Text.success, Style.underline]))
-            elif prob > 0.2:
-                logs.append(c)
-            elif prob > 0.1:
-                logs.append((c, [Text.warning, Style.underline]))
-            elif prob > 0.01:
-                logs.append((c, [Style.bold, Text.warning, Style.underline]))
-            elif prob > 0.001:
-                logs.append((c, [Text.danger, Style.underline]))
+            prompt = text[:i + 1]
+            prompt = prompt[-512:].unsqueeze(-1)
+
+        token_str = tokenizer.itos[text[i]]
+        prob = prediction[text[i]].item()
+
+        for c in token_str:
+            if c == '\n':
+                logger.log(logs)
+                line_no += 1
+                logs = [(f"{line_no: 4d}: ", Text.meta)]
+            elif c == '\r':
+                continue
             else:
-                logs.append((c, [Style.bold, Text.danger, Style.underline]))
+                if prob > 0.9:
+                    logs.append((c, [Text.subtle, Style.underline]))
+                elif prob > 0.75:
+                    logs.append((c, [Text.success, Style.underline]))
+                elif prob > 0.2:
+                    logs.append(c)
+                elif prob > 0.1:
+                    logs.append((c, [Text.warning, Style.underline]))
+                elif prob > 0.01:
+                    logs.append((c, [Style.bold, Text.warning, Style.underline]))
+                elif prob > 0.001:
+                    logs.append((c, [Text.danger, Style.underline]))
+                else:
+                    logs.append((c, [Style.bold, Text.danger, Style.underline]))
 
         i += 1
 
     logger.log(logs)
 
 
 def main():
-    predictor = get_predictor()
+    conf = load_experiment()
 
     with open(str(lab.get_data_path() / 'sample.py'), 'r') as f:
         sample = f.read()
     with monit.section('Anomalies'):
-        anomalies(predictor, sample)
+        anomalies(conf.text.tokenizer, sample, conf.model, conf.state_updater, conf.is_token_by_token)
 
 
 if __name__ == '__main__':
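
The rewritten anomalies() runs the model directly instead of going through Predictor: it walks the tokenized sample one step at a time (carrying the model state token by token, or re-feeding the last 512 tokens of the prompt when is_token_by_token is off), takes the softmax probability the model assigned to the actual next token, and colors every character of that token accordingly. As a reading aid only, not part of the commit, here is the probability-to-style mapping used above, pulled out into a standalone helper:

    from labml.logger import Text, Style

    def style_for(prob: float):
        # Highly predictable tokens fade into the background; unlikely ones get
        # progressively stronger warning/danger styles. Thresholds as in the diff.
        if prob > 0.9:
            return [Text.subtle, Style.underline]
        elif prob > 0.75:
            return [Text.success, Style.underline]
        elif prob > 0.2:
            return None  # logged as plain text
        elif prob > 0.1:
            return [Text.warning, Style.underline]
        elif prob > 0.01:
            return [Style.bold, Text.warning, Style.underline]
        elif prob > 0.001:
            return [Text.danger, Style.underline]
        else:
            return [Style.bold, Text.danger, Style.underline]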

python_autocomplete/evaluate/factory.py  +8 -2

@@ -4,7 +4,7 @@
 from python_autocomplete.train import Configs
 
 
-def get_predictor() -> Predictor:
+def load_experiment() -> Configs:
     conf = Configs()
     experiment.evaluate()
 
@@ -29,7 +29,13 @@ def get_predictor() -> Predictor:
     experiment.load(run_uuid, checkpoint)
 
     experiment.start()
+
+    return conf
+
+
+def get_predictor() -> Predictor:
+    conf = load_experiment()
     conf.model.eval()
     return Predictor(conf.model, conf.text.tokenizer,
                      state_updater=conf.state_updater,
-                     is_token_by_token=conf.is_token_by_token)
+                     is_token_by_token=conf.is_token_by_token)
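
Splitting load_experiment() out of get_predictor() lets anomalies.py pull the raw model, tokenizer, and state updater from the loaded Configs without building a Predictor, while existing callers keep the old interface. A rough sketch of the new call path, not part of the commit:

    from python_autocomplete.evaluate.factory import load_experiment

    conf = load_experiment()  # loads the checkpoint and starts the experiment
    model, tokenizer = conf.model, conf.text.tokenizer
    # anomalies(tokenizer, source_text, model, conf.state_updater, conf.is_token_by_token)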
