Skip to content

Commit 97f7bbd

Browse files
Add GPT2 tokenizer setup and refactor transformer and tokens code (#56)
* Add gpt-tokenizer dependency. * Add function for using a standalone tokenizer with GPT2 instead of using the local task token rep. There seems to be a memory leak, and a refactoring is necessary. * Refactor code so that common functions among transformers are in a single file. Refactor token gemb file so that embedBatch is adapted to a use case where a tokenizer is available. * Refactored computePrediction and computeDecoder so that it's compatible with next N tokens prediction. * Remove unused test function and clean-up comments. * Clarify some TODOs and comments on the code. * Add node_modules/gpt-tokenizer to package-lock.json. * Clarify testing function and simplify mapToIdx and tokenizeAndMapToIdx.
1 parent ca04a1f commit 97f7bbd

File tree

12 files changed

+485
-398
lines changed

12 files changed

+485
-398
lines changed

animated-transformer/package-lock.json

Lines changed: 8 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

animated-transformer/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"@codemirror/language": "^6.9.0",
2727
"@tensorflow/tfjs": "^4.20.0",
2828
"@tensorflow/tfjs-vis": "^1.5.1",
29+
"gpt-tokenizer": "2.8.1",
2930
"codemirror": "^6.0.1",
3031
"d3": "^7.8.0",
3132
"d3-color": "^3.1.0",

animated-transformer/src/app/web-colab/tiny-transformer-example/trainer-cell.worker.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,15 @@ import {
2929
} from './ailab';
3030
import {
3131
computeTransformer,
32-
transformerAccuracy,
3332
TransformerConfig,
34-
lastTokenCrossEntropyLoss,
3533
TransformerModel,
3634
VarTransformerParams,
3735
initDecoderParams,
38-
TransformerComputation,
3936
} from 'src/lib/transformer/transformer_gtensor';
37+
import {
38+
transformerAccuracy,
39+
lastTokenCrossEntropyLoss,
40+
} from 'src/lib/transformer/common_transformer';
4041
import {
4142
assignParams,
4243
deserializeParams,

animated-transformer/src/lib/tokens/token_gemb.spec.ts

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import {
2121
strSeqPrepFn,
2222
embed,
2323
prepareBasicTaskTokenRep,
24+
tokenizeAndMapToIdx,
25+
mapToIdx,
2426
embedBatch,
2527
expectedOutputSeqPrepFn,
2628
} from '../tokens/token_gemb';
@@ -56,8 +58,9 @@ describe('token_gemb', () => {
5658
const tokenEmbedding = new GTensor(tf.tensor([aEmb, bEmb, padEmb]), ['tokenId', 'inputRep']);
5759

5860
const seqsToEmbed = [['a', 'b', '[pad]', 'a'], ['a', 'b'], [], ['b'], ['a']];
61+
const seqsIdxs = mapToIdx(tokenRep.tokenToIdx, seqsToEmbed);
5962

60-
const seqEmb = embedBatch(tokenRep.tokenToIdx, tokenEmbedding, seqsToEmbed, {
63+
const seqEmb = embedBatch(tokenEmbedding, seqsIdxs, {
6164
paddingId: 2,
6265
padAt: 'start',
6366
dtype: 'int32',
@@ -82,8 +85,9 @@ describe('token_gemb', () => {
8285
const embeddings = new GTensor(tf.tensor([aEmb, bEmb, padEmb]), ['tokenId', 'inputRep']);
8386

8487
const seqsToEmbed = [['a', 'b', '[pad]', 'a'], ['a', 'b'], [], ['b'], ['a']];
88+
const seqsIdxs = mapToIdx(tokenRep.tokenToIdx, seqsToEmbed);
8589

86-
const seqEmb = embedBatch(tokenRep.tokenToIdx, embeddings, seqsToEmbed, {
90+
const seqEmb = embedBatch(embeddings, seqsIdxs, {
8791
paddingId: 2,
8892
padAt: 'end',
8993
dtype: 'int32',
@@ -160,4 +164,33 @@ describe('token_gemb', () => {
160164
expect(targetTokensOneHot.tensor.arraySync()).toEqual(expectedOutputArr);
161165
expect(targetTokensOneHot.dimNames).toEqual(['batch', 'pos', 'tokenId'])
162166
});
167+
it('Test tokenizeAndMapToIdx', () => {
168+
// Mock a tokenizer for testing tokenizeAndMapToIdx.
169+
function tokenize_fn_test(input: string): number[] {
170+
let output: number[] = [];
171+
for (let i = 0; i < input.length; i++) {
172+
if (input[i] == 'a')
173+
output = output.concat(0);
174+
else
175+
output = output.concat(1);
176+
}
177+
return output;
178+
};
179+
180+
const seqsToEmbed = ['aba', 'ab', '', 'b', 'a'];
181+
const seqsIdxs = tokenizeAndMapToIdx(tokenize_fn_test, seqsToEmbed);
182+
const expectedIdxs =
183+
[[0, 1, 0], [0, 1], [], [1], [0]];
184+
185+
expect(seqsIdxs).toEqual(expectedIdxs);
186+
});
187+
it('Test mapToIdx', () => {
188+
const tokens = ['a', 'b', '[pad]'];
189+
const tokenRep = prepareBasicTaskTokenRep(tokens);
190+
191+
const seqsToEmbed = [['a', 'b', '[pad]', 'a'], ['a', 'b'], [], ['b'], ['a']];
192+
const seqsIdxs = mapToIdx(tokenRep.tokenToIdx, seqsToEmbed);
193+
const expectedIdxs = [[0, 1, 2, 0], [0, 1], [], [1], [0]];
194+
expect(seqsIdxs).toEqual(expectedIdxs);
195+
});
163196
});

animated-transformer/src/lib/tokens/token_gemb.ts

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -57,23 +57,30 @@ export function embed(
5757
return embeddedInput;
5858
}
5959

60-
// TODO: consider supporting padding string[][] ?
61-
// pad(inputs: string[][], config: {
62-
// paddingId: number;
63-
// padAt: 'start' | 'end';
64-
// dtype: tf.NumericDataType,
65-
// }) {
60+
// Maps tokens in string format to indexes.
61+
export function mapToIdx(
62+
tokenToIdx: { [token: string]: number },
63+
examples: string[][]
64+
): number[][] {
65+
return examples.map((example) => example.map((s) => tokenToIdx[s]));
66+
}
6667

67-
// }
68+
// TODO(@aliciafmachado): Merge this function with the one below
69+
// once we create a class to wrap the tokenization.
70+
export function tokenizeAndMapToIdx(
71+
tokenize_fn: (input: string) => number[],
72+
examples: string[]
73+
): number[][] {
74+
return examples.map((example) => tokenize_fn(example));
75+
}
6876

6977
// When batchSize is defined and batchSize > examples.length, then
7078
// padding-filled examples are added to the final output GTensor. When
7179
// batchSize < examples.length, examples is truncated to make the output
7280
// GTensor.
7381
export function embedBatch(
74-
tokenToIdx: { [token: string]: number },
7582
embeddings: GTensor<'tokenId' | 'inputRep'>,
76-
examples: string[][],
83+
examples: number[][],
7784
config: {
7885
paddingId: number;
7986
padAt: 'start' | 'end';
@@ -91,21 +98,17 @@ export function embedBatch(
9198
let maxInputLength = 0;
9299
if (!config.maxInputLength) {
93100
examples.forEach((l) => (maxInputLength = Math.max(l.length, maxInputLength)));
94-
examples.map((l) => l.map((s) => tokenToIdx[s]));
95101
} else {
96102
maxInputLength = config.maxInputLength;
97103
}
98104

99105
examples.forEach((example) => {
100106
if (example.length >= maxInputLength) {
101107
const tensor = tf.tensor1d(
102-
example.slice(0, maxInputLength).map((s) => tokenToIdx[s]),
108+
example.slice(0, maxInputLength),
103109
config.dtype
104110
);
105111
inputEmbList.push(tensor);
106-
// console.log(l)
107-
// console.log(l.map(s => this.tokenToIdx[s]))
108-
// console.log(tensor.dataSync())
109112
} else if (example.length === 0) {
110113
const tensor = tf.fill([maxInputLength], config.paddingId, config.dtype);
111114
inputEmbList.push(tensor);
@@ -116,7 +119,7 @@ export function embedBatch(
116119
: [[0, maxInputLength - example.length]];
117120
const tensor = tf.pad(
118121
tf.tensor1d(
119-
example.map((s) => tokenToIdx[s]),
122+
example,
120123
config.dtype
121124
),
122125
paddingLocation,
@@ -152,10 +155,15 @@ export type BasicTaskTokenRep = {
152155
spaceToken: string;
153156
// tokens lists all tokens, including mask, pad, eos, etc.
154157
tokens: string[];
158+
// TODO(@aliciafmachado): remove tokenToIdx below once tokenization is wrapped in a class.
155159
tokenToIdx: { [token: string]: number };
156-
idxToOneHot : {[tokenIdx: number]: number[]};
160+
idxToOneHot: { [tokenIdx: number]: number[] };
157161
};
158162

163+
// TODO(@aliciafmachado): token wrap class with the tokenize and untokenize fn?
164+
// Make BasicTaskTokenRep minimal, then add a wrapper class that creates the tokenToIdx and idxToOneHot.
165+
// This interface would be compatible with a tokenizer straight out-of-the-box.
166+
159167
// ----------------------------------------------------------------------------
160168
// Prepare the task representation in a vector space.
161169
// TODO: maybe this should be viewed as a task extension: i.e. Task --> Task
@@ -165,7 +173,7 @@ export function prepareBasicTaskTokenRep(baseVocab: string[]): BasicTaskTokenRep
165173
const padToken = '[PAD]';
166174
const eosToken = '[EOS]';
167175
const spaceToken = ' '
168-
const vocab = [ ...baseVocab, maskToken, padToken, eosToken, spaceToken];
176+
const vocab = [...baseVocab, maskToken, padToken, eosToken, spaceToken];
169177
const tokenToIdx: { [token: string]: number } = {};
170178
vocab.forEach((t, i) => (tokenToIdx[t] = i));
171179

@@ -175,7 +183,7 @@ export function prepareBasicTaskTokenRep(baseVocab: string[]): BasicTaskTokenRep
175183
// );
176184

177185
// TODO: Find a better place for the idxToOneHot lookup table
178-
const idxToOneHot : {[tokenIdx: number]: number[] } = {};
186+
const idxToOneHot: { [tokenIdx: number]: number[] } = {};
179187
const oneHotTokens = [tf.oneHot(tf.tensor1d(Object.values(tokenToIdx), 'int32'), baseVocab.length + 4).arraySync() as number[][]];
180188
Object.values(tokenToIdx).forEach((i) => (idxToOneHot[i] = oneHotTokens[0][i]));
181189
return {
@@ -217,10 +225,10 @@ export function strSeqPrepFn(
217225
options: { maxInputLength: number }
218226
): GTensor<'batch' | 'pos' | 'inputRep'> {
219227
const padTokenId = model.config.tokenRep.tokenToIdx[model.config.tokenRep.padToken];
228+
const inputSeqsInIdxs = mapToIdx(model.config.tokenRep.tokenToIdx, inputSeqs);
220229
const batchedInputEmb = embedBatch(
221-
model.config.tokenRep.tokenToIdx,
222230
model.params.tokenEmbedding,
223-
inputSeqs,
231+
inputSeqsInIdxs,
224232
{
225233
paddingId: padTokenId,
226234
padAt: 'start',
@@ -282,21 +290,21 @@ export function singleNextTokenIdxOutputPrepFn(
282290
}
283291

284292
// Returns the one Hot representation for each token of the expected output sequence for the provided input sequence
285-
export function expectedOutputSeqPrepFn(
293+
export function expectedOutputSeqPrepFn(
286294
model: { config: { tokenRep: BasicTaskTokenRep } },
287295
inputSeqs: string[][],
288296
expectedOutputs: string[][],
289297
): GTensor<'batch' | 'pos' | 'tokenId'> {
290298
// Compute Token rep for inputSeq
291299
const batchInputs = inputSeqs.map((inputSeq) => inputSeq.map((token) => model.config.tokenRep.tokenToIdx[token]))
292-
// Compute Token rep for inputSeq
300+
// Compute Token rep for inputSeq
293301
const expectedOutputSeq = expectedOutputs.map((outputToken) => model.config.tokenRep.tokenToIdx[outputToken[0]])
294302
// Shift input sequences to the right and add the corresponding target in "expectedOutputs" at the end of each sequence
295-
let shiftedInputs = batchInputs.map((x) => x.slice(1, ))
303+
let shiftedInputs = batchInputs.map((x) => x.slice(1,))
296304
const expectedOutputSeqIdx = expectedOutputSeq.map((y, index) => shiftedInputs[index].concat(y))
297305
const expectedOutputSeqOneHot = expectedOutputSeqIdx.map((sample) => sample.map((tidx) => model.config.tokenRep.idxToOneHot[tidx]))
298306
// TODO: We should probably be using a lookup function and storing the one-hot for every token in the GPU as a constant.
299-
return new GTensor(tf.tensor(expectedOutputSeqOneHot),['batch', 'pos', 'tokenId']);
307+
return new GTensor(tf.tensor(expectedOutputSeqOneHot), ['batch', 'pos', 'tokenId']);
300308
}
301309

302310

animated-transformer/src/lib/trainer/basic_transformer_trainer.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import {
2525
splitGenerativeTaskTestSet,
2626
} from '../seqtasks/util';
2727
import { BasicTaskTokenRep, StrSeqPrepFn } from '../tokens/token_gemb';
28-
import { transformerAccuracy } from '../transformer/transformer_gtensor';
28+
import { transformerAccuracy, lastTokenCrossEntropyLoss } from '../transformer/common_transformer';
2929
import { TaskDatasetSplit, TrainState, TrainStateConfig } from './train_state';
3030
import { RandomStream, makeRandomStream } from '../random/random';
3131
// import { GTensorTree, GVariableTree } from 'src/lib/gtensor/gtensor_tree';
@@ -74,7 +74,7 @@ export function initTransformerTrainState(
7474
generator: RandomStream
7575
): tf.Scalar {
7676
const decoderComputation = transformer.computeTransformer(model, inputs, generator);
77-
const loss = transformer.lastTokenCrossEntropyLoss(model, decoderComputation, targets);
77+
const loss = lastTokenCrossEntropyLoss(model, decoderComputation, targets);
7878
return loss as tf.Scalar;
7979
}
8080

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/* Copyright 2023 Google LLC. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
import { GTensor, makeTruncNormal } from '../gtensor/gtensor';
17+
import { causalMask } from './common_transformer';
18+
import * as tf from '@tensorflow/tfjs';
19+
import * as abtask from '../seqtasks/ab_task';
20+
import { embedBatch, mapToIdx, prepareBasicTaskTokenRep } from '../tokens/token_gemb';
21+
22+
describe('Common Transformer util types and functions', () => {
23+
it('AB task data prep', async () => {
24+
const inputRep = 2;
25+
const batchSize = 4;
26+
const task = new abtask.AorBisMaxTask({
27+
kind: 'AorBisMaxTask',
28+
id: 'an A or B is Max task',
29+
maxInputLen: 2,
30+
maxOutputLen: 2,
31+
genStateConfig: { seed: 0 },
32+
// Create a tokenEmbedding that also has a [MASK] token & [PAD] token.
33+
// inputRepSize: inputRep,
34+
});
35+
const tokenRep = prepareBasicTaskTokenRep(task.baseVocab);
36+
const padTokenId = tokenRep.tokenToIdx[tokenRep.padToken];
37+
const embeddings = makeTruncNormal({
38+
tokenId: tokenRep.tokens.length,
39+
inputRep,
40+
});
41+
42+
const examples = task.exampleIter.takeOutN(4);
43+
const examplesIdxs = mapToIdx(tokenRep.tokenToIdx, examples.map((example) => example.input));
44+
const maskIdx = tokenRep.tokenToIdx[tokenRep.maskToken];
45+
46+
const batchedInputEmb = embedBatch(
47+
embeddings,
48+
examplesIdxs.map((example) => example.concat(maskIdx)),
49+
{ paddingId: padTokenId, padAt: 'start', dtype: 'int32' },
50+
);
51+
52+
expect(batchedInputEmb.gshape()).toEqual({
53+
batch: batchSize,
54+
// +1 for the appended [MASK] token to be predicted.
55+
pos: task.config.maxInputLen + 1,
56+
inputRep,
57+
});
58+
});
59+
60+
it('Compute masked self attention', () => {
61+
const exampleAffinities = new GTensor(
62+
tf.tensor([
63+
[
64+
[
65+
[0, 0, 0],
66+
[0, 0, 0],
67+
[0, 0, 0],
68+
],
69+
],
70+
]),
71+
['batch', 'heads', 'keyPos', 'queryPos'],
72+
);
73+
const masked = causalMask(exampleAffinities);
74+
75+
expect(masked.dimNames).toEqual(['batch', 'heads', 'keyPos', 'queryPos']);
76+
tf.test_util.expectArraysClose(masked.tensor.arraySync(), [
77+
[
78+
[
79+
[1, 0, 0],
80+
[0.5, 0.5, 0],
81+
[0.33, 0.33, 0.33],
82+
],
83+
],
84+
]);
85+
});
86+
});

0 commit comments

Comments
 (0)