# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This module defines the character / word pools used for template-driven text generation
together with the ``TextGenerator`` base class shared by the concrete text generators.
"""

import math
import random

import numpy as np
import pandas as pd

#: list of hex digits for template generation
_HEX_LOWER = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f']

#: list of upper case hex digits for template generation
_HEX_UPPER = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']

#: list of non-zero digits for template generation
_DIGITS_NON_ZERO = ['1', '2', '3', '4', '5', '6', '7', '8', '9']

#: list of digits for template generation
_DIGITS_ZERO = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

#: list of uppercase letters for template generation
# NOTE(review): 'T' and 'S' appear swapped relative to alphabetical order; the legacy
# ordering is preserved deliberately because reordering would change seeded outputs
_LETTERS_UPPER = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
                  'Q', 'R', 'T', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z']

#: list of lowercase letters for template generation
_LETTERS_LOWER = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
                  'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

#: list of all letters uppercase and lowercase
_LETTERS_ALL = _LETTERS_LOWER + _LETTERS_UPPER

#: list of alphanumeric chars in lowercase
_ALNUM_LOWER = _LETTERS_LOWER + _DIGITS_ZERO

#: list of alphanumeric chars in uppercase
_ALNUM_UPPER = _LETTERS_UPPER + _DIGITS_ZERO

#: words for ipsum lorem based text generation (lowercase)
_WORDS_LOWER = ['lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit', 'sed', 'do',
                'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua', 'ut',
                'enim', 'ad', 'minim', 'veniam', 'quis', 'nostrud', 'exercitation', 'ullamco', 'laboris',
                'nisi', 'ut', 'aliquip', 'ex', 'ea', 'commodo', 'consequat', 'duis', 'aute', 'irure', 'dolor',
                'in', 'reprehenderit', 'in', 'voluptate', 'velit', 'esse', 'cillum', 'dolore', 'eu', 'fugiat',
                'nulla', 'pariatur', 'excepteur', 'sint', 'occaecat', 'cupidatat', 'non', 'proident', 'sunt',
                'in', 'culpa', 'qui', 'officia', 'deserunt', 'mollit', 'anim', 'id', 'est', 'laborum']

#: words for ipsum lorem based text generation (uppercase)
_WORDS_UPPER = ['LOREM', 'IPSUM', 'DOLOR', 'SIT', 'AMET', 'CONSECTETUR', 'ADIPISCING', 'ELIT', 'SED', 'DO',
                'EIUSMOD', 'TEMPOR', 'INCIDIDUNT', 'UT', 'LABORE', 'ET', 'DOLORE', 'MAGNA', 'ALIQUA', 'UT',
                'ENIM', 'AD', 'MINIM', 'VENIAM', 'QUIS', 'NOSTRUD', 'EXERCITATION', 'ULLAMCO', 'LABORIS',
                'NISI', 'UT', 'ALIQUIP', 'EX', 'EA', 'COMMODO', 'CONSEQUAT', 'DUIS', 'AUTE', 'IRURE',
                'DOLOR', 'IN', 'REPREHENDERIT', 'IN', 'VOLUPTATE', 'VELIT', 'ESSE', 'CILLUM', 'DOLORE',
                'EU', 'FUGIAT', 'NULLA', 'PARIATUR', 'EXCEPTEUR', 'SINT', 'OCCAECAT', 'CUPIDATAT', 'NON',
                'PROIDENT', 'SUNT', 'IN', 'CULPA', 'QUI', 'OFFICIA', 'DESERUNT', 'MOLLIT', 'ANIM', 'ID', 'EST',
                'LABORUM']


class TextGenerator(object):
    """ Base class for text generation classes

    Provides a configurable random seed plus helpers shared by the concrete generators:
    numpy RNG management, compact unsigned dtype selection and tuple-argument normalization.
    """

    def __init__(self):
        # default seed makes generation deterministic unless overridden via `withRandomSeed`
        self._randomSeed = 42
        # lazily created numpy Generator instance, cached across calls
        self._rngInstance = None

    def __repr__(self):
        return f"TextGenerator(randomSeed={self._randomSeed})"

    def __str__(self):
        return f"TextGenerator(randomSeed={self._randomSeed})"

    def __eq__(self, other):
        # strict type equality - a subclass instance with the same seed is not equal
        return type(self) is type(other) and self._randomSeed == other._randomSeed

    def __hash__(self):
        # defined alongside __eq__ so instances remain usable as dict keys / set members
        return hash((type(self), self._randomSeed))

    def withRandomSeed(self, seed):
        """ Set the random seed for the text generator

        :param seed: seed value to set (int or None)
        :return: self, to allow fluent chaining
        """
        assert seed is None or type(seed) is int, "expecting an integer seed for Text Generator"
        self._randomSeed = seed
        return self

    @property
    def randomSeed(self):
        """ Get random seed for text generator"""
        return self._randomSeed

    def getNPRandomGenerator(self, forceNewInstance=False):
        """ Get numpy random number generator

        :param forceNewInstance: if True, always create (and do not cache) a fresh generator
        :return: random number generator initialized from the previously supplied random seed
        """
        assert self._randomSeed is None or type(self._randomSeed) in [int, np.int32, np.int64], \
            f"`random_seed` must be int or int-like not {type(self._randomSeed)}"

        if self._rngInstance is not None and not forceNewInstance:
            return self._rngInstance

        from numpy.random import default_rng
        # a seed of -1 (like None) means "unseeded" - note -1 == -1.0 so one comparison suffices
        if self._randomSeed is not None and self._randomSeed != -1:
            rng = default_rng(seed=self._randomSeed)
        else:
            rng = default_rng()

        if not forceNewInstance:
            self._rngInstance = rng
        return rng

    @staticmethod
    def compactNumpyTypeForValues(listValues):
        """ Determine smallest numpy unsigned integer type that can represent the values

        :param listValues: list or np.ndarray of non-negative values to get np.dtype for
        :return: np.dtype that is the most compact valid representation for the values provided

        Only dtypes numpy actually defines (1, 2, 4 or 8 byte unsigned ints) are returned;
        the byte count is rounded up to the next power of two.
        """
        if type(listValues) is list:
            max_value_represented = int(np.max(np.array(listValues).flatten()))
        else:
            max_value_represented = int(np.max(listValues.flatten()))

        # bit_length gives the exact bits needed, avoiding the off-by-one that
        # ceil(log2(x)) produces for exact powers of two (e.g. 256 needs 9 bits, not 8)
        bits_required = max(max_value_represented.bit_length(), 1)

        if bits_required <= 8:
            # for small values, use byte representation
            return np.dtype('B')

        # compute bytes required and round up to the nearest power of two, since numpy
        # only defines u1, u2, u4 and u8 unsigned types (e.g. "u3" is not a valid dtype)
        bytes_required = int(math.ceil(bits_required / 8.0))
        np_bytes = 1
        while np_bytes < bytes_required:
            np_bytes *= 2
        return np.dtype(f"u{np_bytes}")

    @staticmethod
    def getAsTupleOrElse(v, defaultValue, valueName):
        """ get value v as tuple or return default value

        :param v: value to test - may be an int, a 2-tuple of ints, or None
        :param defaultValue: value to use as a default if value of `v` is None. Must be a 2-tuple.
        :param valueName: name of value for debugging and logging purposes
        :returns: `v` as a tuple if not `None`, or `defaultValue` if `v` is `None`. If `v` is a
            single value, returns the tuple (`v`, `v`)
        """
        assert v is None or type(v) is int or type(v) is tuple, f"param {valueName} must be an int, a tuple or None"
        assert type(defaultValue) is tuple and len(defaultValue) == 2, "default value must be tuple"

        if type(v) is int:
            return v, v
        elif type(v) is tuple:
            assert len(v) == 2, "expecting tuple of length 2"
            assert type(v[0]) is int and type(v[1]) is int, "expecting tuple with both elements as integers"
            return v
        else:
            assert len(defaultValue) == 2, "must have list or iterable with length 2"
            assert type(defaultValue[0]) is int and type(defaultValue[1]) is int, "all elements must be integers"

            return defaultValue
class RandomStr(TextGenerator):  # lgtm [py/missing-equals]
    """This class handles template-driven generation of random string text

    :param template: Template string controlling generation. Multiple alternative templates may be
        separated with ``|`` (use ``\\|`` for a literal ``|``); for each generated row one of the
        alternatives is chosen at random.
    :param escapeSpecialChars: By default special chars in the template have special meaning if unescaped.
        If set to True, then the special meaning requires the escape char ``\\``
    :param extendedWordList: if provided, use specified word list instead of default word list

    The following template chars expand to random content (subject to the escape rules below):
    ``x`` / ``X`` - lower / upper case hex digit, ``d`` - digit 0-9, ``D`` - digit 1-9,
    ``a`` / ``A`` - lower / upper case letter, ``k`` / ``K`` - lower / upper case alphanumeric char,
    ``\\n`` - number 0-255, ``\\N`` - number 0-65535, ``\\w`` / ``\\W`` - lower / upper case word
    from the word list, ``\\v`` / ``\\V`` - the underlying base value.

    .. note::
        If escape is used and `escapeSpecialChars` is False, then the following
        char is assumed to have no special meaning.

        If the `escapeSpecialChars` option is set to True, then the following char only has its special
        meaning when preceded by an escape.

        Some options must be always escaped for example ``\\0``, ``\\v``, ``\\n`` and ``\\w``.

        A special case exists for ``\\v`` - if immediately followed by a digit 0 - 9, the underlying base value
        is interpreted as an array of values and the nth element is retrieved where `n` is the digit specified.

        In all other cases, the char itself is used.

        The setting of the `escapeSpecialChars` determines how templates generate data.

        If set to False, then the template ``r"\\dr_\\v"`` will generate the values ``"dr_0"`` ... ``"dr_999"``
        when applied to the values zero to 999. This conforms to earlier implementations for backwards
        compatibility.

        If set to True, then the template ``r"dr_\\v"`` will generate the values ``"dr_0"`` ... ``"dr_999"``
        when applied to the values zero to 999. This conforms to the preferred style going forward.
    """

    def __init__(self, template, escapeSpecialChars=False, extendedWordList=None):
        assert template is not None, "`template` must be specified"
        super().__init__()

        self._template = template
        self._escapeSpecialMeaning = bool(escapeSpecialChars)

        # split the template into alternatives on unescaped `|` chars; `\|` is protected
        # via a sentinel and restored afterwards
        template_str0 = self._template
        self._templates = [x.replace('$__sep__', '|') for x in template_str0.replace(r'\|', '$__sep__').split('|')]

        self._wordList = np.array(extendedWordList if extendedWordList is not None else _WORDS_LOWER)
        self._upperWordList = np.array([x.upper() for x in extendedWordList]
                                       if extendedWordList is not None else _WORDS_UPPER)

        # numpy arrays of candidate chars allow vectorized fancy-indexed lookup during expansion
        self._np_digits_zero = np.array(_DIGITS_ZERO)
        self._np_digits_non_zero = np.array(_DIGITS_NON_ZERO)
        self._np_hex_upper = np.array(_HEX_UPPER)
        self._np_hex_lower = np.array(_HEX_LOWER)
        self._np_alnum_lower = np.array(_ALNUM_LOWER)
        self._np_alnum_upper = np.array(_ALNUM_UPPER)
        self._np_letters_lower = np.array(_LETTERS_LOWER)
        self._np_letters_upper = np.array(_LETTERS_UPPER)
        self._np_letters_all = np.array(_LETTERS_ALL)
        self._lenWords = len(self._wordList)

        # get the template metadata - placeholder count and random-value bounds per template
        template_info = [self._prepareTemplateStrings(template, escapeSpecialMeaning=escapeSpecialChars)
                         for template in self._templates]
        self._max_placeholders = max(x[0] for x in template_info)
        self._max_rnds_needed = max(len(x[1]) for x in template_info)
        self._placeholders_needed = [x[0] for x in template_info]
        self._template_rnd_bounds = [x[1] for x in template_info]

    def __repr__(self):
        # fixed: previously reported the unrelated name `TemplateGenerator`
        return f"{self.__class__.__name__}(template='{self._template}')"

    @property
    def templates(self):
        """ Get effective templates for text generator"""
        return self._templates

    def _getRandomInt(self, low, high=-1, rng=None):
        """ generate random integer between low and high inclusive

        :param low: low value; if no high value is specified, treat low value as high value with low of 0
        :param high: high value for random number generation
        :param rng: if provided, an instance of a numpy random number generator
        :return: generated value
        """
        if high == -1:
            high = low
            low = 0

        if rng is not None:
            # numpy `integers` excludes the upper bound, unlike `randint`
            return rng.integers(low, high + 1, dtype=np.int32)

        # use standard random for now as it performs better when generating values one at a time
        return random.randint(low, high)

    def _prepareTemplateStrings(self, genTemplate, escapeSpecialMeaning=False):
        """ Prepare list of random numbers needed to generate template in vectorized form

        :param genTemplate: template string to control text generation
        :param escapeSpecialMeaning: if True, requires escape on special meaning chars.
        :returns: tuple containing number of placeholders and vector of random value upper bounds

        The first element of the tuple is the number of placeholders needed to populate the template.

        The second element is a vector of integers, each used later as the *exclusive* upper bound
        of one random number which selects a char / word from the corresponding candidate pool.

        `escapeSpecialMeaning` allows backwards compatibility with old style syntax while allowing
        for preferred new style template syntax. Specify as True to force escapes for special meanings.
        """
        retval = []

        escape = False
        use_value = False
        template_len = len(genTemplate)
        num_placeholders = 0

        # the construct `(not escape) ^ escapeSpecialMeaning` applies special meaning if either
        # escape is not set or the option `escapeSpecialMeaning` is true (logical xor)
        for i in range(0, template_len):
            char = genTemplate[i]
            following_char = genTemplate[i + 1] if i + 1 < template_len else None

            if char == '\\':
                escape = True
            elif use_value and ('0' <= char <= '9'):
                # `\vN` - element of the base value array; no random number needed
                num_placeholders += 1
                use_value = False
            elif char == 'x' and (not escape) ^ escapeSpecialMeaning:
                retval.append(16)  # lowercase hex digit
                num_placeholders += 1
            elif char == 'X' and (not escape) ^ escapeSpecialMeaning:
                retval.append(16)  # uppercase hex digit
                num_placeholders += 1
            elif char == 'd' and (not escape) ^ escapeSpecialMeaning:
                retval.append(10)  # digit 0-9
                num_placeholders += 1
            elif char == 'D' and (not escape) ^ escapeSpecialMeaning:
                retval.append(9)  # non-zero digit 1-9
                num_placeholders += 1
            elif char == 'a' and (not escape) ^ escapeSpecialMeaning:
                retval.append(26)  # lowercase letter
                num_placeholders += 1
            elif char == 'A' and (not escape) ^ escapeSpecialMeaning:
                retval.append(26)  # uppercase letter
                num_placeholders += 1
            elif char == 'k' and (not escape) ^ escapeSpecialMeaning:
                # fixed: bound must cover all 36 entries of `_ALNUM_LOWER` (was 26, which made
                # digits unreachable and was inconsistent with the `K` placeholder)
                retval.append(36)
                num_placeholders += 1
            elif char == 'K' and (not escape) ^ escapeSpecialMeaning:
                retval.append(36)  # uppercase alphanumeric char
                num_placeholders += 1
            elif char == 'n' and escape:
                retval.append(256)  # number 0-255
                num_placeholders += 1
                escape = False
            elif char == 'N' and escape:
                retval.append(65536)  # number 0-65535
                num_placeholders += 1
                escape = False
            elif char == 'W' and escape:
                retval.append(self._lenWords)  # uppercase word from word list
                num_placeholders += 1
                escape = False
            elif char == 'w' and escape:
                retval.append(self._lenWords)  # lowercase word from word list
                num_placeholders += 1
                escape = False
            elif char == 'v' and escape:
                escape = False
                if following_char is not None and ('0' <= following_char <= '9'):
                    # defer the placeholder accounting to the digit branch above
                    use_value = True
                else:
                    num_placeholders += 1  # base value
            elif char == 'V' and escape:
                num_placeholders += 1  # base value
                escape = False
            else:
                num_placeholders += 1  # literal char
                escape = False

        if use_value:
            # trailing `\v` - emits the base value itself
            num_placeholders += 1

        return num_placeholders, retval

    def _applyTemplateStringsForTemplate(self, baseValue, genTemplate, placeholders, rnds, escapeSpecialMeaning=False):
        """ Vectorized implementation of template driven text substitution

        Apply substitutions to placeholders using random numbers

        :param baseValue: Pandas series or data frame of base values for applying the template
        :param genTemplate: template string to control text generation
        :param placeholders: masked np array of type np.object_ pre-allocated to hold strings emitted
        :param rnds: masked numpy 2d array of random numbers needed for vectorized generation
        :param escapeSpecialMeaning: if True, requires escape on special meaning chars.
        :returns: placeholders

        Each placeholder column is populated for all rows at once via fancy indexing with the
        corresponding column of `rnds`; masked rows (rows using a different template) stay untouched.
        This loop must mirror `_prepareTemplateStrings` exactly so that placeholder and random
        column offsets stay in sync.
        """
        assert baseValue.shape[0] == placeholders.shape[0]
        assert baseValue.shape[0] == rnds.shape[0]

        _cached_values = {}

        def _get_values_as_np_array():
            """Get baseValue (a pd.Series or DataFrame) as a numpy array and cache it"""
            if "np_values" not in _cached_values:
                _cached_values["np_values"] = baseValue.to_numpy()
            return _cached_values["np_values"]

        def _get_values_subelement(elem):
            """Get element `elem` of each base value (assumed array-like) as np array and cache it"""
            cache_key = f"v_{elem}"
            if cache_key not in _cached_values:
                np_values = _get_values_as_np_array()
                element_values = np.ndarray(np_values.shape[0], dtype=np_values.dtype)
                for x in range(baseValue.shape[0]):
                    element_values[x] = baseValue[x][elem]
                _cached_values[cache_key] = element_values
            return _cached_values[cache_key]

        escape = False
        use_value = False
        template_len = len(genTemplate)
        num_placeholders = 0
        rnd_offset = 0

        # see `_prepareTemplateStrings` for the xor escape logic explanation
        for i in range(0, template_len):
            char = genTemplate[i]
            following_char = genTemplate[i + 1] if i + 1 < template_len else None

            if char == '\\':
                escape = True
            elif use_value and ('0' <= char <= '9'):
                val_index = int(char)
                placeholders[:, num_placeholders] = _get_values_subelement(val_index)
                num_placeholders += 1
                use_value = False
            elif char == 'x' and (not escape) ^ escapeSpecialMeaning:
                # vectorized lookup - `rnds[:, rnd_offset]` is the column of random
                # numbers for this placeholder across all rows
                placeholders[:, num_placeholders] = self._np_hex_lower[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
            elif char == 'X' and (not escape) ^ escapeSpecialMeaning:
                placeholders[:, num_placeholders] = self._np_hex_upper[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
            elif char == 'd' and (not escape) ^ escapeSpecialMeaning:
                placeholders[:, num_placeholders] = self._np_digits_zero[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
            elif char == 'D' and (not escape) ^ escapeSpecialMeaning:
                placeholders[:, num_placeholders] = self._np_digits_non_zero[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
            elif char == 'a' and (not escape) ^ escapeSpecialMeaning:
                placeholders[:, num_placeholders] = self._np_letters_lower[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
            elif char == 'A' and (not escape) ^ escapeSpecialMeaning:
                placeholders[:, num_placeholders] = self._np_letters_upper[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
            elif char == 'k' and (not escape) ^ escapeSpecialMeaning:
                placeholders[:, num_placeholders] = self._np_alnum_lower[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
            elif char == 'K' and (not escape) ^ escapeSpecialMeaning:
                placeholders[:, num_placeholders] = self._np_alnum_upper[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
            elif char == 'n' and escape:
                # random number 0-255, emitted directly (stringified during the final join)
                placeholders[:, num_placeholders] = rnds[:, rnd_offset]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
                escape = False
            elif char == 'N' and escape:
                # random number 0-65535
                placeholders[:, num_placeholders] = rnds[:, rnd_offset]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
                escape = False
            elif char == 'W' and escape:
                placeholders[:, num_placeholders] = self._upperWordList[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
                escape = False
            elif char == 'w' and escape:
                placeholders[:, num_placeholders] = self._wordList[rnds[:, rnd_offset]]
                num_placeholders += 1
                rnd_offset = rnd_offset + 1
                escape = False
            elif char == 'v' and escape:
                escape = False
                if following_char is not None and ('0' <= following_char <= '9'):
                    use_value = True
                else:
                    placeholders[:, num_placeholders] = _get_values_as_np_array()
                    num_placeholders += 1
            elif char == 'V' and escape:
                placeholders[:, num_placeholders] = _get_values_as_np_array()
                num_placeholders += 1
                escape = False
            else:
                # literal char - same value broadcast to every row
                placeholders[:, num_placeholders] = char
                num_placeholders += 1
                escape = False

        if use_value:
            placeholders[:, num_placeholders] = _get_values_as_np_array()
            num_placeholders += 1

        return placeholders

    def classicGenerateText(self, v):
        """entry point to use for classic (non-pandas) udfs

        :param v: single base value to expand
        :return: generated string
        """
        pdValues = pd.Series([v])
        results = self.pandasGenerateText(pdValues)
        return results[0]

    def _prepare_random_bounds(self, v):
        """
        Prepare the random bounds for processing of the template expansion

        For each template, we will have a vector of random numbers to generate for expanding
        the template. If we have multiple templates, a template is first chosen at random per row.

        :param v: Pandas series of values passed as base values
        :return: vector of templates chosen, template random bounds (one per substitution) and
            selected random numbers for each row (as numpy arrays)
        """
        num_templates = len(self.templates)
        assert num_templates >= 1, "Expecting at least 1 template"

        rng = self.getNPRandomGenerator()

        if num_templates > 1:
            # choose template at random per row from 0 .. num_templates - 1
            # (array argument to `integers` is the exclusive upper bound per element)
            templates_chosen = rng.integers(np.full(v.size, num_templates))
        else:
            # always use template 0
            templates_chosen = np.full(v.size, 0)

        # populate per-row random number bounds; -1 marks unused slots
        template_rnd_bounds = np.full((v.size, self._max_rnds_needed), -1)
        masked_template_bounds = np.ma.MaskedArray(template_rnd_bounds, mask=False)

        for i in range(num_templates):
            # assign the bounds of template `i` to all rows that chose it
            len_bounds_i = len(self._template_rnd_bounds[i])
            masked_template_bounds[templates_chosen.T == i, 0:len_bounds_i] = self._template_rnd_bounds[i]

        # mask out slots not used by the chosen template
        masked_template_bounds[template_rnd_bounds == -1] = np.ma.masked

        r_mask = masked_template_bounds.mask

        template_rnds = template_rnd_bounds.copy()

        # each unmasked entry becomes a random value in [0, bound)
        template_rnds[~r_mask] = rng.integers(masked_template_bounds[~r_mask])

        return templates_chosen, template_rnd_bounds, template_rnds

    def pandasGenerateText(self, v):
        """ entry point to use for pandas udfs

        Implementation uses vectorized implementation of process

        :param v: Pandas series of values passed as base values
        :return: Pandas series of expanded templates
        """
        # placeholders is a numpy array used to hold interim results
        placeholders = np.full((v.shape[0], self._max_placeholders), '', dtype=np.object_)

        # prepare template selections, bounds, rnd values to drive application of algorithm
        template_choices, template_rnd_bounds, template_rnds = self._prepare_random_bounds(v)
        template_choices_t = template_choices.T

        # create masked arrays, masking/unmasking rows per template so that each pass of
        # the substitution method only writes the rows belonging to that template
        masked_placeholders = np.ma.MaskedArray(placeholders, mask=False)
        masked_rnds = np.ma.MaskedArray(template_rnds, mask=False)
        masked_matrices = [masked_placeholders, masked_rnds]

        for x in range(len(self._templates)):
            # mask all rows that did not choose template `x`
            masked_placeholders[template_choices_t != x, :] = np.ma.masked
            masked_rnds[template_choices_t != x, :] = np.ma.masked

            # harden mask, preventing modifications of masked rows during substitution
            for m in masked_matrices:
                np.ma.harden_mask(m)

            # expand values into placeholders
            self._applyTemplateStringsForTemplate(v,
                                                  self._templates[x],
                                                  masked_placeholders,
                                                  masked_rnds,
                                                  escapeSpecialMeaning=self._escapeSpecialMeaning
                                                  )

            # soften and clear mask, allowing modifications on the next pass
            for m in masked_matrices:
                np.ma.soften_mask(m)
                m.mask = False

        # join per-row placeholder fragments into the final strings
        output = pd.Series(list(placeholders))
        results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items]))

        return results
If tuple, will generate random number in tuple range + + """ + + def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None): + """ + Initialize the ILText with text generation parameters + """ + assert paragraphs is not None or sentences is not None or words is not None, \ + "At least one of the params `paragraphs`, `sentences` or `words` must be specified" + + super().__init__() + + self.paragraphs = self.getAsTupleOrElse(paragraphs, (1, 1), "paragraphs") + self.words = self.getAsTupleOrElse(words, (2, 12), "words") + self.sentences = self.getAsTupleOrElse(sentences, (1, 1), "sentences") + self.wordList = extendedWordList if extendedWordList is not None else _WORDS_LOWER + self.shape = [self.paragraphs[1], self.sentences[1], self.words[1]] + + # values needed for the text generation + # numpy uses fixed sizes for strings , so compute whats needed + self._npWords = np.array(self.wordList) + + self._processStats() + self._processWordList() + + def _processStats(self): + """ Compute the stats needed for the text generation """ + + vals = [self.paragraphs, self.sentences, self.words] + self._textGenerationValues = np.array(vals, dtype=self.compactNumpyTypeForValues(vals)) + self._minValues = self._textGenerationValues[:, 0] + self._maxValues = self._textGenerationValues[:, 1] + + self._meanValues = np.mean(self._textGenerationValues, axis=1) + + # we want to force wider spread of sentence length, so we're not simply computing the std_deviation + # - but computing a target std_dev that will spread sentence length + self._stdVals = self._meanValues / 2 + self._stdVals2 = np.std(self._textGenerationValues, axis=1) + + def _processWordList(self): + """ Set up the word lists""" + np_words = np.array(self.wordList, np.dtype(np.str_)) + np_capitalized_words = np.char.capitalize(np_words[:]) + + all_words = np_words[:] + + self._wordOffsetSize = all_words.size + self._sentenceEndOffset = all_words.size + self._paragraphEnd = self._sentenceEndOffset 
+ 1 + self._wordSpaceOffset = self._paragraphEnd + 1 + self._emptyStringOffset = self._wordSpaceOffset + 1 + + punctuation = [". ", "\n\n", " ", ""] + all_words = np.concatenate((all_words, punctuation)) + + self._startOfCapitalsOffset = all_words.size + all_words = np.concatenate((all_words, np_capitalized_words, punctuation)) + + # for efficiency, we'll create list of words preceded by spaces - it will reduce memory consumption during join + # and array manipulation as we dont have to hold offset for space + self._startOfSpacedWordsOffset = all_words.size + + np_spaced_words = np.array([" " + x for x in self.wordList], np.dtype(np.str_)) + all_words = np.concatenate((all_words, np_spaced_words, punctuation)) + + # set up python list of all words so that we dont have to convert between numpy and python representations + self._allWordsSize = all_words.size + self._wordsAsPythonStrings = list([str(x) for x in all_words]) + + # get smallest type that can represent word offset + self._wordOffsetType = self.compactNumpyTypeForValues([all_words.size * 2 + 10]) + + def __repr__(self): + paras, sentences, words = self.paragraphs, self.sentences, self.words + wl = self.wordList.__repr__ if self.wordList is not None else "None" + return f"ILText(paragraphs={paras}, sentences={sentences}, words={words}, wordList={wl})" + + def generateText(self, baseValues, rowCount=1): + """ + generate text for seed based on configuration parameters. 
+ + As it uses numpy, repeatability is restricted depending on version of the runtime + + :param baseValues: list or array-like list of baseValues + :param rowCount: number of rows + :returns: list or Pandas series of generated strings of same size as input seed + """ + assert baseValues is not None, "`baseValues` param must be specified" + rng = self.getNPRandomGenerator(forceNewInstance=True) + word_offset_type = self._wordOffsetType + + stats_shape = [rowCount, self.paragraphs[1], self.sentences[1], 3] + + # determine counts of paragraphs, sentences and words + stats_array = np.empty(stats_shape, dtype=self._textGenerationValues.dtype) + para_stats_raw = np.round(rng.normal(self._meanValues, self._stdVals2, size=stats_shape)) + para_stats = np.clip(para_stats_raw, self._minValues, self._maxValues, out=stats_array) + + # replicate paragraphs and sentences from first row of each paragraph through other elememnts + # this is used to efficiently create mask for manipulating word offsets + # after this every row will contain : + # number of good paragraphs, number of good sentences, number of number of good words + # for current sentence in current paragraph in outer rows collection + para_stats[:, :, :, 0] = para_stats[:, :, 0, 0, np.newaxis] + para_stats[:, :, :, 1] = para_stats[:, :, 0, 1, np.newaxis] + + # set up shape for arrays used to process word offsets + # this will be masked so that we don't waste resources processing invalid paragraphs, sentences and words + output_shape = (rowCount, self.paragraphs[1], self.sentences[1], self.words[1]) + + # compute the masks for paragraphs, sentences, and words + + # get the set of indices for shape - r = rows, p = paragraphs, s = sentences, w = words + # the indices will produce a set of rows of values for each dimension + # the mask is then produced by iterating comparing index with good value + # for example - if number of good words is 3 and sentence is index array of [0, 1, 2, 3, 4, 5 ,6] + # then mask is produced 
via conditions indices <= good_word + # note value of True means masked for numpy + r, p, s, w = np.indices(output_shape) + + good_words = para_stats[:, :, :, 2] + good_paragraphs = para_stats[:, :, :, 0] + good_sentences = para_stats[:, :, :, 1] + + # build masks in each dimension and `or` them together + words_mask = (w.T >= good_words.T).T + para_mask = (p.T >= good_paragraphs.T).T + sentences_mask = (s.T >= good_sentences.T).T + final_mask = words_mask | para_mask | sentences_mask + + word_offsets = np.full(output_shape, dtype=word_offset_type, fill_value=self._emptyStringOffset) + masked_offsets = np.ma.MaskedArray(word_offsets, mask=final_mask) + + # note numpy random differs from standard random in that it never produces upper bound + masked_offsets[~masked_offsets.mask] = rng.integers(self._wordOffsetSize, + size=output_shape, + dtype=self._wordOffsetType)[~masked_offsets.mask] + + # hardening a mask prevents masked values from being changed + np.ma.harden_mask(masked_offsets) + masked_offsets[:, :, :, 0] = masked_offsets[:, :, :, 0] + self._startOfCapitalsOffset + masked_offsets[:, :, :, 1:] = masked_offsets[:, :, :, 1:] + self._startOfSpacedWordsOffset + np.ma.soften_mask(masked_offsets) + + # add period to every sentence + # we'll replicate column 0 in order to preserve the mask and fill unmasked entries + # with the sentence end offset + new_word_offsets = masked_offsets[:, :, :, 0][:] + new_col = new_word_offsets[:, :, :, np.newaxis] + terminated_word_offsets = np.ma.concatenate((masked_offsets, new_col), axis=3) + new_column = terminated_word_offsets[:, :, :, -1] + new_column[~new_column.mask] = self._sentenceEndOffset + + # reshape to paragraphs + shape = terminated_word_offsets.shape + paragraph_offsets = terminated_word_offsets.reshape((rowCount, shape[1], shape[2] * shape[3])) + + if self.paragraphs[1] > 1: + # add paragraph terminator to every paragraph + # we'll take a copy of the first column so as to preserve masking + # i.e if first word of 
first sentence is masked, then end marker position should be masked + new_word_offsets = paragraph_offsets[:, :, 0][:] + new_col = new_word_offsets[:, :, np.newaxis] + terminated_paragraph_offsets = np.ma.concatenate((paragraph_offsets, new_col), axis=2) + + # set the paragraph end marker on all paragraphs except last + # new_masked_elements = terminated_paragraph_offsets[:,:,-1] + new_column = terminated_paragraph_offsets[:, :, -1] + new_column[~new_column.mask] = self._paragraphEnd + else: + terminated_paragraph_offsets = paragraph_offsets + + # reshape to rows containing word offset sequences. We'll join using pandas apply to avoid + # memory issues with fixed strings in numpy + shape = terminated_paragraph_offsets.shape + terminated_paragraph_offsets = terminated_paragraph_offsets.reshape((rowCount, shape[1] * shape[2])) + + final_data = terminated_paragraph_offsets.filled(fill_value=self._emptyStringOffset) + + # its faster to manipulate text in data frames as numpy strings are fixed length + all_python_words = self._wordsAsPythonStrings + + base_results = pd.DataFrame(final_data) + + # build our lambda expression, copying point to word list locally for efficiency + empty_string_offsets = [self._emptyStringOffset, self._emptyStringOffset + self._startOfSpacedWordsOffset] + mk_str_fn = lambda x: ("".join([all_python_words[x1] for x1 in x if x1 not in empty_string_offsets])).strip() + # mk_str_fn = lambda x: ("".join([all_python_words[x1] for x1 in x ])) + + # ... 
and execute it + results = base_results.apply(mk_str_fn, axis=1) + return results + + def classicGenerateText(self, v): + """ + classic udf entry point for text generation + + :param v: base value to control generation of random numbers + """ + return self.generateText([v], 1)[0] + + def pandasGenerateText(self, v): + """ + pandas udf entry point for text generation + + :param v: pandas series of base values for random text generation + :returns: Pandas series of generated strings + """ + rows = v.to_numpy() + results = self.generateText(rows, rows.size) + return pd.Series(results) From 41298b8a301427dc724f26aa6f740ee2636cd738 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Wed, 29 Mar 2023 10:58:39 -0700 Subject: [PATCH 2/8] wip --- dbldatagen/__init__.py | 3 +- dbldatagen/column_generation_spec.py | 5 +- ...xt_randomstr.py => text_generatestring.py} | 283 +++++------------- dbldatagen/text_generators.py | 11 + tests/test_text_generatestring.py | 75 +++++ 5 files changed, 172 insertions(+), 205 deletions(-) rename dbldatagen/{text_randomstr.py => text_generatestring.py} (73%) create mode 100644 tests/test_text_generatestring.py diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py index cc0e8f5d..8d9937f3 100644 --- a/dbldatagen/__init__.py +++ b/dbldatagen/__init__.py @@ -40,11 +40,12 @@ from .spark_singleton import SparkSingleton from .text_generators import TemplateGenerator, ILText, TextGenerator from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText +from .text_generatestring import GenerateString __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange", "column_generation_spec", "utils", "function_builder", "spark_singleton", "text_generators", "datarange", "datagen_constants", - "text_generator_plugins" + "text_generator_plugins", "text_generatestring" ] diff --git a/dbldatagen/column_generation_spec.py b/dbldatagen/column_generation_spec.py index 370c2de7..b1684f7e 100644 --- 
a/dbldatagen/column_generation_spec.py +++ b/dbldatagen/column_generation_spec.py @@ -1075,7 +1075,7 @@ def _applyPrefixSuffixExpressions(self, cprefix, csuffix, new_def): new_def = concat(new_def.astype(IntegerType()), lit(text_separator), lit(csuffix)) return new_def - def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations): + def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations=True): """Apply text generation expression to column expression :param new_def : column definition being created @@ -1086,6 +1086,9 @@ def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations): # while it seems like this could use a shared instance, this does not work if initialized # in a class method tg = self.textGenerator + + new_def = tg.prepareBaseValue(new_def) + if use_pandas_optimizations: self.executionHistory.append(f".. text generation via pandas scalar udf `{tg}`") u_value_from_generator = pandas_udf(tg.pandasGenerateText, diff --git a/dbldatagen/text_randomstr.py b/dbldatagen/text_generatestring.py similarity index 73% rename from dbldatagen/text_randomstr.py rename to dbldatagen/text_generatestring.py index 92571ddf..238ddf0c 100644 --- a/dbldatagen/text_randomstr.py +++ b/dbldatagen/text_generatestring.py @@ -12,242 +12,119 @@ import numpy as np import pandas as pd -#: list of hex digits for template generation -_HEX_LOWER = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'] +from .text_generators import TextGenerator +from .text_generators import _DIGITS_ZERO, _LETTERS_UPPER, _LETTERS_LOWER, _LETTERS_ALL -#: list of upper case hex digits for template generation -_HEX_UPPER = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'] +import pyspark.sql.functions as F -#: list of non-zero digits for template generation -_DIGITS_NON_ZERO = ['1', '2', '3', '4', '5', '6', '7', '8', '9'] +class GenerateString(TextGenerator): # lgtm [py/missing-equals] + """This class 
handles the generation of string text of specified length drawn from alphanumeric characters. -#: list of digits for template generation -_DIGITS_ZERO = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + The set of chars to be used can be modified based on the parameters -#: list of uppercase letters for template generation -_LETTERS_UPPER = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'T', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z'] + This will generate deterministic strings chosen from the pool of characters `0-9`, `a-z`, `A-Z`, or from a + custom character range if specified. -#: list of lowercase letters for template generation -_LETTERS_LOWER = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', - 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] + :param length: length of string. Can be integer, or tuple (min, max) + :param leadingAlpha: If True, leading character will be in range a-zAA-Z + :param allUpper: If True, any alpha chars will be uppercase + :param allLower: If True, any alpha chars will be lowercase + :param allAlpha: If True, all chars will be non numeric + :param customChars: If supplied, specifies a list of chars to use, or string of chars to use. -#: list of all letters uppercase and lowercase -_LETTERS_ALL = _LETTERS_LOWER + _LETTERS_UPPER + This method will generate deterministic strings varying in size from `minLength` to `maxLength`. + The characters chosen will be in the range 0-9`, `a-z`, `A-Z` unless modified using the `leadingAlpha`, + `allUpper`, `allLower`, `allAlpha` or `customChars` parameters. 
-#: list of alphanumeric chars in lowercase -_ALNUM_LOWER = _LETTERS_LOWER + _DIGITS_ZERO + The modifiers can be combined - for example GenerateString(1, 5, leadingAlpha=True, allUpper=True) -#: list of alphanumeric chars in uppercase -_ALNUM_UPPER = _LETTERS_UPPER + _DIGITS_ZERO + When the length is specified to be a tuple, it wll generate variable length strings of lengths from the lower bound + to the upper bound inclusive. -""" words for ipsum lorem based text generation""" -_WORDS_LOWER = ['lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit', 'sed', 'do', - 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliqua', 'ut', - 'enim', 'ad', 'minim', 'veniam', 'quis', 'nostrud', 'exercitation', 'ullamco', 'laboris', - 'nisi', 'ut', 'aliquip', 'ex', 'ea', 'commodo', 'consequat', 'duis', 'aute', 'irure', 'dolor', - 'in', 'reprehenderit', 'in', 'voluptate', 'velit', 'esse', 'cillum', 'dolore', 'eu', 'fugiat', - 'nulla', 'pariatur', 'excepteur', 'sint', 'occaecat', 'cupidatat', 'non', 'proident', 'sunt', - 'in', 'culpa', 'qui', 'officia', 'deserunt', 'mollit', 'anim', 'id', 'est', 'laborum'] + The strings are generated deterministically so that they can be used for predictable primary and foreign keys. 
-_WORDS_UPPER = ['LOREM', 'IPSUM', 'DOLOR', 'SIT', 'AMET', 'CONSECTETUR', 'ADIPISCING', 'ELIT', 'SED', 'DO', - 'EIUSMOD', 'TEMPOR', 'INCIDIDUNT', 'UT', 'LABORE', 'ET', 'DOLORE', 'MAGNA', 'ALIQUA', 'UT', - 'ENIM', 'AD', 'MINIM', 'VENIAM', 'QUIS', 'NOSTRUD', 'EXERCITATION', 'ULLAMCO', 'LABORIS', - 'NISI', 'UT', 'ALIQUIP', 'EX', 'EA', 'COMMODO', 'CONSEQUAT', 'DUIS', 'AUTE', 'IRURE', - 'DOLOR', 'IN', 'REPREHENDERIT', 'IN', 'VOLUPTATE', 'VELIT', 'ESSE', 'CILLUM', 'DOLORE', - 'EU', 'FUGIAT', 'NULLA', 'PARIATUR', 'EXCEPTEUR', 'SINT', 'OCCAECAT', 'CUPIDATAT', 'NON', - 'PROIDENT', 'SUNT', 'IN', 'CULPA', 'QUI', 'OFFICIA', 'DESERUNT', 'MOLLIT', 'ANIM', 'ID', 'EST', - 'LABORUM'] + If the column definition that includes this specifies `random` then the string generation will be determined by a + seeded random number according to the rules for random numbers and random seeds used in other columns + If random is false, then the string will be generated from a pseudo random sequence generated purely from the + SQL hash of the `baseColumns` -class TextGenerator(object): - """ Base class for text generation classes + .. note:: + If customChars are specified, then the flag `allAlpha` will only remove digits. 
""" - def __init__(self): - self._randomSeed = 42 - self._rngInstance = None - - def __repr__(self): - return f"TextGenerator(randomSeed={self._randomSeed})" - - def __str__(self): - return f"TextGenerator(randomSeed={self._randomSeed})" - - def __eq__(self, other): - return type(self) == type(other) and self._randomSeed == other._randomSeed - - def withRandomSeed(self, seed): - """ Set the random seed for the text generator - - :param seed: seed value to set - :return: self - """ - assert seed is None or type(seed) is int, "expecting an integer seed for Text Generator" - self._randomSeed = seed - return self + def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, allAlpha=False, customChars=None): + super().__init__() - @property - def randomSeed(self): - """ Get random seed for text generator""" - return self._randomSeed + print("testing") - def getNPRandomGenerator(self, forceNewInstance=False): - """ Get numpy random number generator + assert customChars is None or isinstance(customChars, list) or isinstance(customChars, str), \ + "`customChars` should be list of characters or string containing custom chars" - :return: returns random number generator initialized from previously supplied random seed - """ - assert self._randomSeed is None or type(self._randomSeed) in [int, np.int32, np.int64], \ - f"`random_seed` must be int or int-like not {type(self._randomSeed)}" + assert not allUpper or not allLower, "allUpper and allLower cannot both be True" - if self._rngInstance is not None and not forceNewInstance: - return self._rngInstance + if isinstance(customChars, str): + assert len(customChars) > 0, "string of customChars must be non-empty" + elif isinstance(customChars, list): + assert all(isinstance(c, str) for c in customChars) + assert len(customChars) > 0, "list of customChars must be non-empty" - from numpy.random import default_rng - if self._randomSeed is not None and self._randomSeed not in (-1, -1.0): - rng = 
default_rng(seed=self._randomSeed) + # determine base alphabet + if isinstance(customChars, list): + charAlphabet = set("".join(customChars)) + elif isinstance(customChars, str): + charAlphabet = set(customChars) else: - rng = default_rng() + charAlphabet = set(_LETTERS_ALL).union(set(_DIGITS_ZERO)) - if not forceNewInstance: - self._rngInstance = rng - return rng + if allLower: + charAlphabet = charAlphabet.difference(set(_LETTERS_UPPER)) + elif allUpper: + charAlphabet = charAlphabet.difference(set(_LETTERS_LOWER)) - @staticmethod - def compactNumpyTypeForValues(listValues): - """ determine smallest numpy type to represent values + if allAlpha: + charAlphabet = charAlphabet.difference(set(_DIGITS_ZERO)) - :param listValues: list or np.ndarray of values to get np.dtype for - :return: np.dtype that is most compact representation for values provided - """ - if type(listValues) is list: - max_value_represented = np.max(np.array(listValues).flatten()) - else: - max_value_represented = np.max(listValues.flatten()) + 1 - bits_required = math.ceil(math.log2(max_value_represented)) + self._charAlphabet = np.array(list(charAlphabet)) - if bits_required <= 8: - # for small values, use byte representation - retval = np.dtype('B') + if leadingAlpha: + self._firstCharAlphabet = np.array(list(charAlphabet.difference(set(_DIGITS_ZERO)))) else: - # compute bytes required and raise to nearest power of 2 - bytesRequired = int(math.ceil(bits_required / 8.0)) - retval = np.dtype(f"u{bytesRequired}") - return retval - - @staticmethod - def getAsTupleOrElse(v, defaultValue, valueName): - """ get value v as tuple or return default value - - :param v: value to test - :param defaultValue: value to use as a default if value of `v` is None. Must be a tuple. - :param valueName: name of value for debugging and logging purposes - :returns: return `v` as tuple if not `None` or value of `default_v` if `v` is `None`. 
If `v` is a single - value, returns the tuple (`v`, `v`)""" - assert v is None or type(v) is int or type(v) is tuple, f"param {valueName} must be an int, a tuple or None" - assert type(defaultValue) is tuple and len(defaultValue) == 2, "default value must be tuple" - - if type(v) is int: - return v, v - elif type(v) is tuple: - assert len(v) == 2, "expecting tuple of length 2" - assert type(v[0]) is int and type(v[1]) is int, "expecting tuple with both elements as integers" - return v + self._firstCharAlphabet = self._charAlphabet + + # compute string lengths + if isinstance(length, int): + self._minLength, self._maxLength = length, length + elif isinstance(length, tuple): + assert len(length) == 2, "only 2 elements can be specified if length is a tuple" + assert all(isinstance(el, int) for el in length) + self._minLength, self._maxLength = length else: - assert len(defaultValue) == 2, "must have list or iterable with lenght 2" - assert type(defaultValue[0]) is int and type(defaultValue[1]) is int, "all elements must be integers" - - return defaultValue + raise ValueError("`length` must be an integer or a tuple of two integers") + # compute bounds for generated strings + bounds = [] + bounds.append( len(self._firstCharAlphabet)) + for ix in range(1, self._maxLength): + bounds.append(len(self._charAlphabet)) -class RandomStr(TextGenerator): # lgtm [py/missing-equals] - """This class handles the generation of random string text - - This will generate random strings chosen from the pool of characters `0-9`, `a-z`, `A-Z` - - :param minLength: Minimum length of string - :param maxLength: Maximum length of string. 
If not specified, min and max length are the same - :param leadingAlpha: If True, leading character will be in range a-zAA-Z - :param allUpper: If True, any alpha chars will be uppercase - :param allLower: If True, any alpha chars will be lowercase - :param allAlpha: If True, all chars will be in theb range a-z, A-Z + self._bounds = bounds - This method will generate random strings varying in size from `minLength` to `maxLength`. The characters chosen - will be in the range 0-9`, `a-z`, `A-Z` unless modified using the `leadingAlpha`, `allUpper`, `allLower` - or `allAlpha` parameters. - - The modifiers can be combined - for example RandomStr(1, 5, leadingAlpha=True, allUpper=True) - - - :param escapeSpecialChars: By default special chars in the template have special meaning if unescaped - If set to true, then the special meaning requires escape char ``\\`` - :param extendedWordList: if provided, use specified word list instead of default word list - - - .. note:: - If escape is used and`escapeSpecialChars` is False, then the following - char is assumed to have no special meaning. - - If the `escapeSpecialChars` option is set to True, then the following char only has its special - meaning when preceded by an escape. - - Some options must be always escaped for example ``\\0``, ``\\v``, ``\\n`` and ``\\w``. - - A special case exists for ``\\v`` - if immediately followed by a digit 0 - 9, the underlying base value - is interpreted as an array of values and the nth element is retrieved where `n` is the digit specified. - - In all other cases, the char itself is used. - - The setting of the `escapeSpecialChars` determines how templates generate data. - - If set to False, then the template ``r"\\dr_\\v"`` will generate the values ``"dr_0"`` ... ``"dr_999"`` when applied - to the values zero to 999. This conforms to earlier implementations for backwards compatibility. - - If set to True, then the template ``r"dr_\\v"`` will generate the values ``"dr_0"`` ... 
``"dr_999"`` - when applied to the values zero to 999. This conforms to the preferred style going forward - - """ + def __repr__(self): + return f"GenerateString(length={length}, leadingAlpha={leadingAlpha})" - def __init__(self, template, escapeSpecialChars=False, extendedWordList=None): - assert template is not None, "`template` must be specified" - super().__init__() + def prepareBaseValue(self, baseDef): + """ Prepare the base value for processing - self._template = template - self._escapeSpecialMeaning = bool(escapeSpecialChars) - template_str0 = self._template - self._templates = [x.replace('$__sep__', '|') for x in template_str0.replace(r'\|', '$__sep__').split('|')] - self._wordList = np.array(extendedWordList if extendedWordList is not None else _WORDS_LOWER) - self._upperWordList = np.array([x.upper() for x in extendedWordList] - if extendedWordList is not None else _WORDS_UPPER) - - self._np_digits_zero = np.array(_DIGITS_ZERO) - self._np_digits_non_zero = np.array(_DIGITS_NON_ZERO) - self._np_hex_upper = np.array(_HEX_UPPER) - self._np_hex_lower = np.array(_HEX_LOWER) - self._np_alnum_lower = np.array(_ALNUM_LOWER) - self._np_alnum_upper = np.array(_ALNUM_UPPER) - self._np_letters_lower = np.array(_LETTERS_LOWER) - self._np_letters_upper = np.array(_LETTERS_UPPER) - self._np_letters_all = np.array(_LETTERS_ALL) - self._lenWords = len(self._wordList) - - # get the template metadata - template_info = [self._prepareTemplateStrings(template, escapeSpecialMeaning=escapeSpecialChars) - for template in self._templates] - self._max_placeholders = max([ x[0] for x in template_info]) - self._max_rnds_needed = max([ len(x[1]) for x in template_info]) - self._placeholders_needed = [ x[0] for x in template_info] - self._template_rnd_bounds = [ x[1] for x in template_info] + :param baseDef: base value expression + :return: base value expression unchanged + For generate string processing , we'll use the SQL function abs(hash(baseDef) - def __repr__(self): - return 
f"TemplateGenerator(template='{self._template}')" - - @property - def templates(self): - """ Get effective templates for text generator""" - return self._templates + This will ensure that even if there are multiple base values, only a single value is passed to the UDF + """ + return F.abs(F.hash(baseDef)) def _getRandomInt(self, low, high=-1, rng=None): """ generate random integer between low and high inclusive @@ -648,7 +525,7 @@ def pandasGenerateText(self, v): return results -class ILText(TextGenerator): # lgtm [py/missing-equals] +class ILText2(TextGenerator): # lgtm [py/missing-equals] """ Class to generate Ipsum Lorem text paragraphs, words and sentences :param paragraphs: Number of paragraphs to generate. If tuple will generate random number in range diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index 877b9779..ae10644d 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -161,6 +161,17 @@ def getAsTupleOrElse(v, defaultValue, valueName): return defaultValue + def prepareBaseValue(self, baseDef): + """ Prepare the base value for processing + + :param baseDef: base value expression + :return: base value expression unchanged + + Derived classes are expected to override this if needed + """ + return baseDef + + class TemplateGenerator(TextGenerator): # lgtm [py/missing-equals] """This class handles the generation of text from templates diff --git a/tests/test_text_generatestring.py b/tests/test_text_generatestring.py new file mode 100644 index 00000000..828a7b68 --- /dev/null +++ b/tests/test_text_generatestring.py @@ -0,0 +1,75 @@ +import pytest +import pandas as pd +import numpy as np + +import pyspark.sql.functions as F +from pyspark.sql.types import BooleanType, DateType +from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType + +import dbldatagen as dg + +spark = dg.SparkSingleton.getLocalInstance("unit tests") + 
+spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "20000") +spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") + +#: list of digits for template generation +_DIGITS_ZERO = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + +#: list of uppercase letters for template generation +_LETTERS_UPPER = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'T', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z'] + +#: list of lowercase letters for template generation +_LETTERS_LOWER = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', + 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] + +#: list of all letters uppercase and lowercase +_LETTERS_ALL = _LETTERS_LOWER + _LETTERS_UPPER + +#: list of alphanumeric chars in lowercase +_ALNUM_LOWER = _LETTERS_LOWER + _DIGITS_ZERO + +#: list of alphanumeric chars in uppercase +_ALNUM_UPPER = _LETTERS_UPPER + _DIGITS_ZERO + + +# Test manipulation and generation of test data for a large schema +class TestTextGenerateString: + + @pytest.mark.parametrize("length, leadingAlpha, allUpper, allLower, allAlpha, customChars", + [ + (5, True, True, False, False, None), + (5, True, False, True, False, None), + (5, True, False, False, True, None), + (5, False, False, False, False, None), + (5, False, True, False, True, None), + (5, False, False, True, True, None), + (5, False, False, False, False, "01234567890ABCDEF"), + ]) + def test_basics(self, length, leadingAlpha, allUpper, allLower, allAlpha, customChars): + + tg1 = dg.GenerateString(length, leadingAlpha=leadingAlpha, allUpper=allUpper, allLower=allLower, + allAlpha=allAlpha, customChars=customChars) + + assert tg1._charAlphabet is not None + assert tg1._firstCharAlphabet is not None + + if allUpper and allAlpha: + alphabet = _LETTERS_UPPER + elif allLower and allAlpha: + alphabet = _LETTERS_LOWER + elif allLower: + alphabet = _LETTERS_LOWER + _DIGITS_ZERO + elif allUpper: + alphabet = _LETTERS_UPPER + 
_DIGITS_ZERO + elif allAlpha: + alphabet = _LETTERS_UPPER + _LETTERS_LOWER + else: + alphabet = _LETTERS_UPPER + _LETTERS_LOWER + _DIGITS_ZERO + + if customChars is not None: + alphabet = set(alphabet).intersection(set(customChars)) + + assert set(tg1._charAlphabet) == set(alphabet) + From 1e37ff3fd7e9ef39915b718855aa444577faf8bd Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Sat, 1 Apr 2023 18:43:32 -0700 Subject: [PATCH 3/8] wip --- dbldatagen/text_generatestring.py | 641 ++---------------------------- tests/test_text_generatestring.py | 20 +- 2 files changed, 55 insertions(+), 606 deletions(-) diff --git a/dbldatagen/text_generatestring.py b/dbldatagen/text_generatestring.py index 238ddf0c..82ad6e44 100644 --- a/dbldatagen/text_generatestring.py +++ b/dbldatagen/text_generatestring.py @@ -57,9 +57,7 @@ class GenerateString(TextGenerator): # lgtm [py/missing-equals] def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, allAlpha=False, customChars=None): super().__init__() - print("testing") - - assert customChars is None or isinstance(customChars, list) or isinstance(customChars, str), \ + assert not customChars or isinstance(customChars, list) or isinstance(customChars, str), \ "`customChars` should be list of characters or string containing custom chars" assert not allUpper or not allLower, "allUpper and allLower cannot both be True" @@ -70,6 +68,11 @@ def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, al assert all(isinstance(c, str) for c in customChars) assert len(customChars) > 0, "list of customChars must be non-empty" + self.leadingAlpha = leadingAlpha + self.allUpper = allUpper + self.allLower = allLower + self.allAlpha = allAlpha + # determine base alphabet if isinstance(customChars, list): charAlphabet = set("".join(customChars)) @@ -95,7 +98,8 @@ def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, al # compute string lengths if isinstance(length, int): - self._minLength, 
self._maxLength = length, length + self._minLength = length + self._maxLength = length elif isinstance(length, tuple): assert len(length) == 2, "only 2 elements can be specified if length is a tuple" assert all(isinstance(el, int) for el in length) @@ -104,15 +108,30 @@ def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, al raise ValueError("`length` must be an integer or a tuple of two integers") # compute bounds for generated strings - bounds = [] - bounds.append( len(self._firstCharAlphabet)) + bounds = [len(self._firstCharAlphabet)] for ix in range(1, self._maxLength): bounds.append(len(self._charAlphabet)) self._bounds = bounds def __repr__(self): - return f"GenerateString(length={length}, leadingAlpha={leadingAlpha})" + return f"GenerateString(length={(self._minLength, self._maxLength)}, leadingAlpha={self.leadingAlpha})" + + def make_variable_length_mask(self, v, lengths): + """ given 2-d array of dimensions[r, c] and lengths of dimensions[r] + + generate mask for each row where col_index[r,c] < lengths[r] + """ + print(v.shape, lengths.shape) + assert v.shape[0] == lengths.shape[0], "values and lengths must agree on dimension 0]" + _, c_ix = np.indices(v.shape) + + return (c_ix.T < lengths.T).T + + def mk_bounds(self, v, minLength, maxLength): + rng = default_rng(42) + v_bounds = np.full(v.shape[0], (maxLength - minLength) + 1) + return rng.integers(v_bounds) + minLength def prepareBaseValue(self, baseDef): """ Prepare the base value for processing @@ -126,349 +145,6 @@ def prepareBaseValue(self, baseDef): """ return F.abs(F.hash(baseDef)) - def _getRandomInt(self, low, high=-1, rng=None): - """ generate random integer between low and high inclusive - - :param low: low value, if no high value is specified, treat low value as high value and low of 0 - :param high: high value for random number generation - :param rng: if provided, an instance of a numpy random number generator - :return: generated value - """ - if high == -1: - high 
= low - low = 0 - - if rng is not None: - # numpy interval is different to ``randint`` - return rng.integers(low, high + 1, dtype=np.int32) - - # use standard random for now as it performs better when generating values one at a time - return random.randint(low, high) - - def _prepareTemplateStrings(self, genTemplate, escapeSpecialMeaning=False): - """ Prepare list of random numbers needed to generate template in vectorized form - - :param genTemplate: template string to control text generation - :param escapeSpecialMeaning: if True, requires escape on special meaning chars. - :returns: tuple containing number of placeholders and vector of random values upper bounds - - The first element of the tuple is the number of placeholders needed to populate the template - - The second elememt is a vector of integer values which determine bounds for random number vector for - template generation - - Each element of the vector will be used to generate a random number between 0 and the element inclusive, - which is then used to select words from wordlists etc for template expansion - - `_escapeSpecialMeaning` parameter allows for backwards compatibility with old style syntax while allowing - for preferred new style template syntax. Specify as True to force escapes for special meanings,. - - """ - retval = [] - - escape = False - use_value = False - template_len = len(genTemplate) - num_placeholders = 0 - - # in the following code, the construct `(not escape) ^ self._escapeSpecialMeaning` means apply - # special meaning if either escape is not true or the option `self._escapeSpecialMeaning` is true. 
- # This corresponds to the logical xor operation - for i in range(0, template_len): - char = genTemplate[i] - following_char = genTemplate[i + 1] if i + 1 < template_len else None - - if char == '\\': - escape = True - elif use_value and ('0' <= char <= '9'): - # val_index = int(char) - # retval.append(str(baseValue[val_index])) - num_placeholders += 1 - use_value = False - elif char == 'x' and (not escape) ^ escapeSpecialMeaning: - retval.append(16) - num_placeholders += 1 - # used for retval.append(_HEX_LOWER[self._getRandomInt(0, 15, rndGenerator)]) - elif char == 'X' and (not escape) ^ escapeSpecialMeaning: - retval.append(16) - num_placeholders += 1 - # retval.append(_HEX_UPPER[self._getRandomInt(0, 15, rndGenerator)]) - elif char == 'd' and (not escape) ^ escapeSpecialMeaning: - retval.append(10) - num_placeholders += 1 - # retval.append(_DIGITS_ZERO[self._getRandomInt(0, 9, rndGenerator)]) - elif char == 'D' and (not escape) ^ escapeSpecialMeaning: - retval.append(9) - num_placeholders += 1 - # retval.append(_DIGITS_NON_ZERO[self._getRandomInt(0, 8, rndGenerator)]) - elif char == 'a' and (not escape) ^ escapeSpecialMeaning: - retval.append(26) - num_placeholders += 1 - # retval.append(_LETTERS_LOWER[self._getRandomInt(0, 25, rndGenerator)]) - elif char == 'A' and (not escape) ^ escapeSpecialMeaning: - retval.append(26) - num_placeholders += 1 - # retval.append(_LETTERS_UPPER[self._getRandomInt(0, 25, rndGenerator)]) - elif char == 'k' and (not escape) ^ escapeSpecialMeaning: - retval.append(26) - num_placeholders += 1 - # retval.append(_ALNUM_LOWER[self._getRandomInt(0, 35, rndGenerator)]) - elif char == 'K' and (not escape) ^ escapeSpecialMeaning: - retval.append(36) - num_placeholders += 1 - # retval.append(_ALNUM_UPPER[self._getRandomInt(0, 35, rndGenerator)]) - elif char == 'n' and escape: - retval.append(256) - num_placeholders += 1 - # retval.append(str(self._getRandomInt(0, 255, rndGenerator))) - escape = False - elif char == 'N' and escape: - 
retval.append(65536) - num_placeholders += 1 - # retval.append(str(self._getRandomInt(0, 65535, rndGenerator))) - escape = False - elif char == 'W' and escape: - retval.append(self._lenWords) - num_placeholders += 1 - # retval.append(self._upperWordList[self._getRandomWordOffset(self._lenWords, rndGenerator=rndGenerator)]) - escape = False - elif char == 'w' and escape: - retval.append(self._lenWords) - num_placeholders += 1 - # retval.append(self._wordList[self._getRandomWordOffset(self._lenWords, rndGenerator=rndGenerator)]) - escape = False - elif char == 'v' and escape: - escape = False - if following_char is not None and ('0' <= following_char <= '9'): - use_value = True - else: - num_placeholders += 1 - # retval.append(str(baseValue)) - elif char == 'V' and escape: - # retval.append(str(baseValue)) - num_placeholders += 1 - escape = False - else: - # retval.append(char) - num_placeholders += 1 - escape = False - - if use_value: - # retval.append(str(baseValue)) - num_placeholders += 1 - - return num_placeholders, retval - - def _applyTemplateStringsForTemplate(self, baseValue, genTemplate, placeholders, rnds, escapeSpecialMeaning=False): - """ Vectorized implementation of template driven text substitution - - Apply substitutions to placeholders using random numbers - - :param baseValue: Pandas series or data frame of base value for applying template - :param genTemplate: template string to control text generation - :param placeholders: masked nparray of type np.object_ pre-allocated to hold strings emitted - :param rnds: masked numpy 2d array of random numbers needed for vectorized generation - :param escapeSpecialMeaning: if True, requires escape on special meaning chars. - :returns: placeholders - - The vectorized implementation populates the placeholder Numpy array with the substituted values. - - `_escapeSpecialMeaning` parameter allows for backwards compatibility with old style syntax while allowing - for preferred new style template syntax. 
Specify as True to force escapes for special meanings,. - - """ - assert baseValue.shape[0] == placeholders.shape[0] - assert baseValue.shape[0] == rnds.shape[0] - - _cached_values = {} - - def _get_values_as_np_array(): - """Get baseValue which is pd.Series or Dataframe as a numpy array and cache it""" - if "np_values" not in _cached_values: - _cached_values["np_values"] = baseValue.to_numpy() - - return _cached_values["np_values"] - - def _get_values_subelement(elem): - """Get element from base values as np array and cache it""" - cache_key = f"v_{elem}" - if cache_key not in _cached_values: - np_values = _get_values_as_np_array() - #element_values = [] - element_values = np.ndarray( np_values.shape[0], dtype=np_values.dtype) - - for x in range(baseValue.shape[0]): - #element_values.append(baseValue[x][elem]) - element_values[x] = baseValue[x][elem] - _cached_values[cache_key] = element_values - - return _cached_values[cache_key] - - escape = False - use_value = False - template_len = len(genTemplate) - num_placeholders = 0 - rnd_offset = 0 - - # in the following code, the construct `(not escape) ^ self._escapeSpecialMeaning` means apply - # special meaning if either escape is not true or the option `self._escapeSpecialMeaning` is true. 
- # This corresponds to the logical xor operation - for i in range(0, template_len): - char = genTemplate[i] - following_char = genTemplate[i + 1] if i + 1 < template_len else None - - if char == '\\': - escape = True - elif use_value and ('0' <= char <= '9'): - val_index = int(char) - placeholders[:, num_placeholders] = _get_values_subelement(val_index) - #placeholders[:, num_placeholders] = pd_base_values.apply(lambda x: str(x[val_index])) - num_placeholders += 1 - use_value = False - elif char == 'x' and (not escape) ^ escapeSpecialMeaning: - # note vectorized lookup - `rnds[:, rnd_offset]` will get vertical column of - # random numbers from `rnds` 2d array - placeholders[:, num_placeholders] = self._np_hex_lower[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # used for retval.append(_HEX_LOWER[self._getRandomInt(0, 15, rndGenerator)]) - elif char == 'X' and (not escape) ^ escapeSpecialMeaning: - placeholders[:, num_placeholders] = self._np_hex_upper[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(_HEX_UPPER[self._getRandomInt(0, 15, rndGenerator)]) - elif char == 'd' and (not escape) ^ escapeSpecialMeaning: - placeholders[:, num_placeholders] = self._np_digits_zero[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(_DIGITS_ZERO[self._getRandomInt(0, 9, rndGenerator)]) - elif char == 'D' and (not escape) ^ escapeSpecialMeaning: - placeholders[:, num_placeholders] = self._np_digits_non_zero[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(_DIGITS_NON_ZERO[self._getRandomInt(0, 8, rndGenerator)]) - elif char == 'a' and (not escape) ^ escapeSpecialMeaning: - placeholders[:, num_placeholders] = self._np_letters_lower[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(_LETTERS_LOWER[self._getRandomInt(0, 25, rndGenerator)]) - elif char == 'A' and (not escape) ^ 
escapeSpecialMeaning: - placeholders[:, num_placeholders] = self._np_letters_upper[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(_LETTERS_UPPER[self._getRandomInt(0, 25, rndGenerator)]) - elif char == 'k' and (not escape) ^ escapeSpecialMeaning: - placeholders[:, num_placeholders] = self._np_alnum_lower[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(_ALNUM_LOWER[self._getRandomInt(0, 35, rndGenerator)]) - elif char == 'K' and (not escape) ^ escapeSpecialMeaning: - placeholders[:, num_placeholders] = self._np_alnum_upper[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(_ALNUM_UPPER[self._getRandomInt(0, 35, rndGenerator)]) - elif char == 'n' and escape: - placeholders[:, num_placeholders] = rnds[:, rnd_offset] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(str(self._getRandomInt(0, 255, rndGenerator))) - escape = False - elif char == 'N' and escape: - placeholders[:, num_placeholders] = rnds[:, rnd_offset] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(str(self._getRandomInt(0, 65535, rndGenerator))) - escape = False - elif char == 'W' and escape: - placeholders[:, num_placeholders] = self._upperWordList[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(self._upperWordList[self._getRandomWordOffset(self._lenWords, rndGenerator=rndGenerator)]) - escape = False - elif char == 'w' and escape: - placeholders[:, num_placeholders] = self._wordList[rnds[:, rnd_offset]] - num_placeholders += 1 - rnd_offset = rnd_offset + 1 - # retval.append(self._wordList[self._getRandomWordOffset(self._lenWords, rndGenerator=rndGenerator)]) - escape = False - elif char == 'v' and escape: - escape = False - if following_char is not None and ('0' <= following_char <= '9'): - use_value = True - else: - placeholders[:, num_placeholders] = _get_values_as_np_array() - 
num_placeholders += 1 - # retval.append(str(baseValue)) - elif char == 'V' and escape: - placeholders[:, num_placeholders] = _get_values_as_np_array() - # retval.append(str(baseValue)) - num_placeholders += 1 - escape = False - else: - placeholders[:, num_placeholders] = char - # retval.append(char) - num_placeholders += 1 - escape = False - - if use_value: - placeholders[:, num_placeholders] = _get_values_as_np_array() - # retval.append(str(baseValue)) - num_placeholders += 1 - - return placeholders - - def classicGenerateText(self, v): - """entry point to use for classic udfs""" - - pdValues = pd.Series([v]) - results = self.pandasGenerateText(pdValues) - return results[0] - - def _prepare_random_bounds(self, v): - """ - Prepare the random bounds for processing of the template expansion - - For each template, we will have a vector of random numbers to generate for expanding the template - - If we have multiple templates, there will be a separate vector of random numbers for each template - - - :param v: Pandas series of values passed as base values - :return: vector of templates chosen, template random bounds (1 for each substitution) and selected - random numbers for each row (as numpy array) - """ - # choose templates - num_templates = len(self.templates) - assert num_templates >= 1, "Expecting at least 1 template" - - rng = self.getNPRandomGenerator() - - if num_templates > 1: - # choose template at random from 0 .. 
num_templates - templates_chosen = rng.integers(np.full(v.size, num_templates)) - else: - # always use template 0 - templates_chosen = np.full(v.size, 0) - - # populate template random numbers - template_rnd_bounds = np.full((v.size, self._max_rnds_needed), -1) - masked_template_bounds = np.ma.MaskedArray(template_rnd_bounds, mask=False) - - for i in range(num_templates): - # assign the - len_bounds_i = len(self._template_rnd_bounds[i]) - masked_template_bounds[templates_chosen.T == i, 0:len_bounds_i] = self._template_rnd_bounds[i] - - masked_template_bounds[template_rnd_bounds == -1] = np.ma.masked - - r_mask = masked_template_bounds.mask - - template_rnds = template_rnd_bounds.copy() - - template_rnds[~r_mask] = rng.integers(masked_template_bounds[~r_mask]) - - return templates_chosen, template_rnd_bounds, template_rnds - def pandasGenerateText(self, v): """ entry point to use for pandas udfs @@ -479,267 +155,26 @@ def pandasGenerateText(self, v): """ # placeholders is numpy array used to hold results - placeholders = np.full((v.shape[0], self._max_placeholders), '', dtype=np.object_) - - # prepare template selections, bounds, rnd values to drive application of algorithm - template_choices, template_rnd_bounds, template_rnds = self._prepare_random_bounds(v) - template_choices_t = template_choices.T - - # create masked arrays, with all elements initially masked - # as we substitute template expansion, we'll mask and unmask rows corresponding to each template - # calling the method to substitute the values on the masked placeholders - masked_placeholders = np.ma.MaskedArray(placeholders, mask=False) - masked_rnds = np.ma.MaskedArray(template_rnds, mask=False) - # masked_base_values = np.ma.MaskedArray(baseValues, mask=False) - masked_matrices = [masked_placeholders, masked_rnds] - - # test logic for template expansion - for x in range(len(self._templates)): - masked_placeholders[template_choices_t != x, :] = np.ma.masked - masked_rnds[template_choices_t != x, :] = 
np.ma.masked - #masked_base_values[template_choices_t != x] = np.ma.masked - - # harden mask, preventing modifications - for m in masked_matrices: - np.ma.harden_mask(m) - - # expand values into placeholders - #self._applyTemplateStringsForTemplate(v.to_numpy(dtype=np.object_), #masked_base_values, - self._applyTemplateStringsForTemplate(v, - #masked_base_values, - self._templates[x], - masked_placeholders, - masked_rnds, - escapeSpecialMeaning=self._escapeSpecialMeaning - ) - - # soften and clear mask, allowing modifications - for m in masked_matrices: - np.ma.soften_mask(m) - m.mask = False - # join strings in placeholders - output = pd.Series(list(placeholders)) - results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items])) - - return results + rnds = np.full((v.shape[0], self._maxLength), len(self._charAlphabet), dtype=np.object_) + rng = self.getNPRandomGenerator() + rnds2 = rng.integers(rnds) -class ILText2(TextGenerator): # lgtm [py/missing-equals] - """ Class to generate Ipsum Lorem text paragraphs, words and sentences - - :param paragraphs: Number of paragraphs to generate. If tuple will generate random number in range - :param sentences: Number of sentences to generate. If tuple will generate random number in tuple range - :param words: Number of words per sentence to generate. 
If tuple, will generate random number in tuple range - - """ - - def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None): - """ - Initialize the ILText with text generation parameters - """ - assert paragraphs is not None or sentences is not None or words is not None, \ - "At least one of the params `paragraphs`, `sentences` or `words` must be specified" - - super().__init__() - - self.paragraphs = self.getAsTupleOrElse(paragraphs, (1, 1), "paragraphs") - self.words = self.getAsTupleOrElse(words, (2, 12), "words") - self.sentences = self.getAsTupleOrElse(sentences, (1, 1), "sentences") - self.wordList = extendedWordList if extendedWordList is not None else _WORDS_LOWER - self.shape = [self.paragraphs[1], self.sentences[1], self.words[1]] - - # values needed for the text generation - # numpy uses fixed sizes for strings , so compute whats needed - self._npWords = np.array(self.wordList) - - self._processStats() - self._processWordList() - - def _processStats(self): - """ Compute the stats needed for the text generation """ - - vals = [self.paragraphs, self.sentences, self.words] - self._textGenerationValues = np.array(vals, dtype=self.compactNumpyTypeForValues(vals)) - self._minValues = self._textGenerationValues[:, 0] - self._maxValues = self._textGenerationValues[:, 1] - - self._meanValues = np.mean(self._textGenerationValues, axis=1) - - # we want to force wider spread of sentence length, so we're not simply computing the std_deviation - # - but computing a target std_dev that will spread sentence length - self._stdVals = self._meanValues / 2 - self._stdVals2 = np.std(self._textGenerationValues, axis=1) - - def _processWordList(self): - """ Set up the word lists""" - np_words = np.array(self.wordList, np.dtype(np.str_)) - np_capitalized_words = np.char.capitalize(np_words[:]) - - all_words = np_words[:] - - self._wordOffsetSize = all_words.size - self._sentenceEndOffset = all_words.size - self._paragraphEnd = self._sentenceEndOffset 
+ 1 - self._wordSpaceOffset = self._paragraphEnd + 1 - self._emptyStringOffset = self._wordSpaceOffset + 1 - - punctuation = [". ", "\n\n", " ", ""] - all_words = np.concatenate((all_words, punctuation)) - - self._startOfCapitalsOffset = all_words.size - all_words = np.concatenate((all_words, np_capitalized_words, punctuation)) - - # for efficiency, we'll create list of words preceded by spaces - it will reduce memory consumption during join - # and array manipulation as we dont have to hold offset for space - self._startOfSpacedWordsOffset = all_words.size - - np_spaced_words = np.array([" " + x for x in self.wordList], np.dtype(np.str_)) - all_words = np.concatenate((all_words, np_spaced_words, punctuation)) - - # set up python list of all words so that we dont have to convert between numpy and python representations - self._allWordsSize = all_words.size - self._wordsAsPythonStrings = list([str(x) for x in all_words]) - - # get smallest type that can represent word offset - self._wordOffsetType = self.compactNumpyTypeForValues([all_words.size * 2 + 10]) - - def __repr__(self): - paras, sentences, words = self.paragraphs, self.sentences, self.words - wl = self.wordList.__repr__ if self.wordList is not None else "None" - return f"ILText(paragraphs={paras}, sentences={sentences}, words={words}, wordList={wl})" - - def generateText(self, baseValues, rowCount=1): - """ - generate text for seed based on configuration parameters. 
- - As it uses numpy, repeatability is restricted depending on version of the runtime + placeholders = np.full((v.shape[0], self._maxLength), '', dtype=np.object_) - :param baseValues: list or array-like list of baseValues - :param rowCount: number of rows - :returns: list or Pandas series of generated strings of same size as input seed - """ - assert baseValues is not None, "`baseValues` param must be specified" - rng = self.getNPRandomGenerator(forceNewInstance=True) - word_offset_type = self._wordOffsetType - - stats_shape = [rowCount, self.paragraphs[1], self.sentences[1], 3] - - # determine counts of paragraphs, sentences and words - stats_array = np.empty(stats_shape, dtype=self._textGenerationValues.dtype) - para_stats_raw = np.round(rng.normal(self._meanValues, self._stdVals2, size=stats_shape)) - para_stats = np.clip(para_stats_raw, self._minValues, self._maxValues, out=stats_array) - - # replicate paragraphs and sentences from first row of each paragraph through other elememnts - # this is used to efficiently create mask for manipulating word offsets - # after this every row will contain : - # number of good paragraphs, number of good sentences, number of number of good words - # for current sentence in current paragraph in outer rows collection - para_stats[:, :, :, 0] = para_stats[:, :, 0, 0, np.newaxis] - para_stats[:, :, :, 1] = para_stats[:, :, 0, 1, np.newaxis] - - # set up shape for arrays used to process word offsets - # this will be masked so that we don't waste resources processing invalid paragraphs, sentences and words - output_shape = (rowCount, self.paragraphs[1], self.sentences[1], self.words[1]) - - # compute the masks for paragraphs, sentences, and words - - # get the set of indices for shape - r = rows, p = paragraphs, s = sentences, w = words - # the indices will produce a set of rows of values for each dimension - # the mask is then produced by iterating comparing index with good value - # for example - if number of good words is 3 and 
sentence is index array of [0, 1, 2, 3, 4, 5 ,6] - # then mask is produced via conditions indices <= good_word - # note value of True means masked for numpy - r, p, s, w = np.indices(output_shape) - - good_words = para_stats[:, :, :, 2] - good_paragraphs = para_stats[:, :, :, 0] - good_sentences = para_stats[:, :, :, 1] - - # build masks in each dimension and `or` them together - words_mask = (w.T >= good_words.T).T - para_mask = (p.T >= good_paragraphs.T).T - sentences_mask = (s.T >= good_sentences.T).T - final_mask = words_mask | para_mask | sentences_mask - - word_offsets = np.full(output_shape, dtype=word_offset_type, fill_value=self._emptyStringOffset) - masked_offsets = np.ma.MaskedArray(word_offsets, mask=final_mask) - - # note numpy random differs from standard random in that it never produces upper bound - masked_offsets[~masked_offsets.mask] = rng.integers(self._wordOffsetSize, - size=output_shape, - dtype=self._wordOffsetType)[~masked_offsets.mask] - - # hardening a mask prevents masked values from being changed - np.ma.harden_mask(masked_offsets) - masked_offsets[:, :, :, 0] = masked_offsets[:, :, :, 0] + self._startOfCapitalsOffset - masked_offsets[:, :, :, 1:] = masked_offsets[:, :, :, 1:] + self._startOfSpacedWordsOffset - np.ma.soften_mask(masked_offsets) - - # add period to every sentence - # we'll replicate column 0 in order to preserve the mask and fill unmasked entries - # with the sentence end offset - new_word_offsets = masked_offsets[:, :, :, 0][:] - new_col = new_word_offsets[:, :, :, np.newaxis] - terminated_word_offsets = np.ma.concatenate((masked_offsets, new_col), axis=3) - new_column = terminated_word_offsets[:, :, :, -1] - new_column[~new_column.mask] = self._sentenceEndOffset - - # reshape to paragraphs - shape = terminated_word_offsets.shape - paragraph_offsets = terminated_word_offsets.reshape((rowCount, shape[1], shape[2] * shape[3])) - - if self.paragraphs[1] > 1: - # add paragraph terminator to every paragraph - # we'll take a 
copy of the first column so as to preserve masking - # i.e if first word of first sentence is masked, then end marker position should be masked - new_word_offsets = paragraph_offsets[:, :, 0][:] - new_col = new_word_offsets[:, :, np.newaxis] - terminated_paragraph_offsets = np.ma.concatenate((paragraph_offsets, new_col), axis=2) - - # set the paragraph end marker on all paragraphs except last - # new_masked_elements = terminated_paragraph_offsets[:,:,-1] - new_column = terminated_paragraph_offsets[:, :, -1] - new_column[~new_column.mask] = self._paragraphEnd - else: - terminated_paragraph_offsets = paragraph_offsets + lengths = (v.to_numpy() % (self._maxLength - self._minLength) + self._minLength) - # reshape to rows containing word offset sequences. We'll join using pandas apply to avoid - # memory issues with fixed strings in numpy - shape = terminated_paragraph_offsets.shape - terminated_paragraph_offsets = terminated_paragraph_offsets.reshape((rowCount, shape[1] * shape[2])) + v1 = np.full((v.shape[0], self._maxLength), -1) - final_data = terminated_paragraph_offsets.filled(fill_value=self._emptyStringOffset) + placeholder_mask = self.make_variable_length_mask(placeholders, lengths) + masked_placeholders = np.ma.MaskedArray(placeholders, mask=placeholder_mask) - # its faster to manipulate text in data frames as numpy strings are fixed length - all_python_words = self._wordsAsPythonStrings + masked_placeholders[~placeholder_mask] = self._charAlphabet[rnds2[~placeholder_mask]] - base_results = pd.DataFrame(final_data) + output = pd.Series(list(placeholders)) - # build our lambda expression, copying point to word list locally for efficiency - empty_string_offsets = [self._emptyStringOffset, self._emptyStringOffset + self._startOfSpacedWordsOffset] - mk_str_fn = lambda x: ("".join([all_python_words[x1] for x1 in x if x1 not in empty_string_offsets])).strip() - # mk_str_fn = lambda x: ("".join([all_python_words[x1] for x1 in x ])) + # join strings in placeholders + 
results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items])) - # ... and execute it - results = base_results.apply(mk_str_fn, axis=1) return results - - def classicGenerateText(self, v): - """ - classic udf entry point for text generation - - :param v: base value to control generation of random numbers - """ - return self.generateText([v], 1)[0] - - def pandasGenerateText(self, v): - """ - pandas udf entry point for text generation - - :param v: pandas series of base values for random text generation - :returns: Pandas series of generated strings - """ - rows = v.to_numpy() - results = self.generateText(rows, rows.size) - return pd.Series(results) diff --git a/tests/test_text_generatestring.py b/tests/test_text_generatestring.py index 828a7b68..993d8af6 100644 --- a/tests/test_text_generatestring.py +++ b/tests/test_text_generatestring.py @@ -1,7 +1,4 @@ import pytest -import pandas as pd -import numpy as np - import pyspark.sql.functions as F from pyspark.sql.types import BooleanType, DateType from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType @@ -73,3 +70,20 @@ def test_basics(self, length, leadingAlpha, allUpper, allLower, allAlpha, custom assert set(tg1._charAlphabet) == set(alphabet) + def test_simple_data(self): + dgspec = (dg.DataGenerator(sparkSession=spark, name="alt_data_set", rows=10000, + partitions=4, seedMethod='hash_fieldname', verbose=True, + seedColumnName="_id") + .withIdOutput() + .withColumn("code2", IntegerType(), min=0, max=10) + .withColumn("code3", StringType(), values=['a', 'b', 'c']) + .withColumn("code4", StringType(), values=['a', 'b', 'c'], random=True) + .withColumn("code5", StringType(), text=dg.GenerateString( (1, 10) )) + ) + + fieldsFromGenerator = set(dgspec.getOutputColumnNames()) + + df_testdata = dgspec.build() + + df_testdata.show() + From 5794e03022cd4c720e2508adea545c6d1efbbdbc Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Sun, 2 Apr 
2023 09:51:51 -0700 Subject: [PATCH 4/8] wip --- tests/test_text_generatestring.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/test_text_generatestring.py b/tests/test_text_generatestring.py index 993d8af6..8db08b5c 100644 --- a/tests/test_text_generatestring.py +++ b/tests/test_text_generatestring.py @@ -70,7 +70,18 @@ def test_basics(self, length, leadingAlpha, allUpper, allLower, allAlpha, custom assert set(tg1._charAlphabet) == set(alphabet) - def test_simple_data(self): + @pytest.mark.parametrize("genstr", + [ + dg.GenerateString((1, 10)), + dg.GenerateString((1, 10), leadingAlpha=True), + dg.GenerateString((4, 64), allUpper=True), + dg.GenerateString((10, 20), allLower=True), + dg.GenerateString((1, 10)), + dg.GenerateString((3, 15)), + dg.GenerateString((17,22)), + dg.GenerateString((1, 10)), + ]) + def test_simple_data(self, genstr): dgspec = (dg.DataGenerator(sparkSession=spark, name="alt_data_set", rows=10000, partitions=4, seedMethod='hash_fieldname', verbose=True, seedColumnName="_id") From 92f19b5acd0a988fc88d3f0040f5230424b3fd08 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 4 Apr 2023 10:49:37 -0700 Subject: [PATCH 5/8] wip --- dbldatagen/__init__.py | 3 +- dbldatagen/value_based_prng.py | 125 +++++++++++++++++++++++++++++++++ docs/utils/mk_quick_index.py | 4 ++ tests/test_value_based_PRNG.py | 113 +++++++++++++++++++++++++++++ 4 files changed, 244 insertions(+), 1 deletion(-) create mode 100644 dbldatagen/value_based_prng.py create mode 100644 tests/test_value_based_PRNG.py diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py index 8d9937f3..906b06f2 100644 --- a/dbldatagen/__init__.py +++ b/dbldatagen/__init__.py @@ -41,11 +41,12 @@ from .text_generators import TemplateGenerator, ILText, TextGenerator from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText from .text_generatestring import GenerateString +from .value_based_prng import ValueBasedPRNG 
__all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange", "column_generation_spec", "utils", "function_builder", "spark_singleton", "text_generators", "datarange", "datagen_constants", - "text_generator_plugins", "text_generatestring" + "text_generator_plugins", "text_generatestring", "value_based_prng" ] diff --git a/dbldatagen/value_based_prng.py b/dbldatagen/value_based_prng.py new file mode 100644 index 00000000..c2e2293a --- /dev/null +++ b/dbldatagen/value_based_prng.py @@ -0,0 +1,125 @@ +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This module defines the ValueBasedPRNG class + +The Value Based PRNG (psuedo random number generator) uses the PCG algorithm to generate repeatable +psuedo random numbers. It achieves excellent statistical performance with small and fast code, and small state size. + +More details of the PCG algorithnm can be found here. + +https://www.pcg-random.org/index.html + +The implementation used here, uses as a numpy array of values to seed different random seeds for each row +so that generating pseudo random sequences across each row of a numpy 2d array are completely repeatable. + +This differs from existing Numpy implementations as in existing Numpy random number generators, +each row does not have a separate random seed. + +The goal of this implementation is to ensure that the sequence of psuedo-random numbers generated for a given row +are independent of the value of the seed in other rows. + +In practice, this makes it ideal for generating repeatable foreign keys for a given primary key. + +""" + +import numpy as np + +class ValueBasedPRNG(object): + """ Value based psuedo-random number generator designed to generate repeatable random sequences. + + It uses the supplied array `v` to seed each a set of states for generating random numbers in sequences of shape + where the first dimension is a multiple of the size of the seed array `v`. 
+ + The overall goal is to provide a prng that is replacement for the numpy random number generator where only + the `integers` method is used. + + Note that by array-like, we mean a list, a numpy array or a numeric value + """ + + _MASK32 = 2 ** 32 - 1 # uint32_t + _MASK64 = 2 ** 64 - 1 # uint64_t + + def __init__(self, v, shape=None, dtype=None): + """ + + :param v: array-like list of values to seed the number generator + :param shape: If supplied, indicates that generation should allow for generations of values of supplied shape + :param dtype: Numpy dtype for generation of random numbers + + If a shape is supplied, it creates sufficient state to allow for generation of the number of values indicated + by the shape parameter independently. In effect, each row has a separate random generator and each column will + be generated by using a separate stream. + + This may raise a TypeError if the values cannot be converted to integers. + + If converting from string values, it expects that strings can be converted to ints, and if not + will raise a ValueError. 
+ """ + effective_values = np.array(v) # if v is already numpy array, its a no-op + + if effective_values.dtype not in [ np.int32, np.uint32, np.int64, np.uint64]: + if np.can_cast(effective_values, np.int64, casting="unsafe"): + effective_values = effective_values.astype(np.int64, casting="unsafe") + else: + raise TypeError("Cant cast values to np.int64") + + self._v = effective_values + self._dtype = dtype or effective_values.dtype + + # scalars have a numpy shape of () + self._shape = shape or effective_values.shape + self._internal_shape = self._shape + + # internally treat scalars as array of size 1 + if self._v.shape == (): + self._internal_shape = (1,) + + @property + def shape(self): + """get the `shape` attribute""" + return self._shape + + @property + def seedValues(self): + """ Get the values that were used to seed the PRNG""" + return self._v + + + def integers(self, low, high=None, size=None, dtype=None, endpoint=False): + """ Return psuedo-random integers from low (inclusive) to high (exclusive), or if endpoint=True, + low (inclusive) to high (inclusive). + + :param low: lowint or array-like of ints. Lowest (signed) integers to be drawn from the distribution + (unless high=None, in which case this parameter is 0 and this value is used for high). + :param high: highint or array-like of ints, optional. f provided, one above the largest (signed) integer + to be drawn from the distribution (see above for behavior if high=None). + If array-like, must contain integer values + :param size: int or tuple of ints, optional that defines the output shape. + If the given shape is, e.g., (m, n, k), then m * n * k samples are drawn. + Default is None, in which case the size of the set of values used to initialize the PRNG + is used. + :param dtype: Desired Numpy dtype of the result. Byteorder must be native. The default value is np.int64 + :param endpoint: If true, sample from the interval [low, high] instead of the default [low, high). 
+ Defaults to False + :return: ndarray of ints. size-shaped array of random integers from the appropriate distribution, + or a single such random int if size not provided. + + """ + low_arr = np.array(low) + + if high is None: + high_arr = low_arr + low_arr = np.full(low_arr.shape, 0) + else: + high_arr = np.array(high) + + if endpoint: + high_arr = high_arr + 1 + + if np.any(high_arr <= 0): + raise ValueError("high <= 0") + + return low_arr.reshape(low.shape) diff --git a/docs/utils/mk_quick_index.py b/docs/utils/mk_quick_index.py index 9342c883..772a8f1b 100644 --- a/docs/utils/mk_quick_index.py +++ b/docs/utils/mk_quick_index.py @@ -33,10 +33,14 @@ "grouping": "main classes"}, "text_generator_plugins.py": {"briefDesc": "Text data generation", "grouping": "main classes"}, + "text_generatestring.py": {"briefDesc": "Text data generation", + "grouping": "main classes"}, "data_analyzer.py": {"briefDesc": "Analysis of existing data", "grouping": "main classes"}, "function_builder.py": {"briefDesc": "Internal utilities to create functions related to weights", "grouping": "internal classes"}, + "value_based_prng.py": {"briefDesc": "Value based pseudo-random number generator", + "grouping": "internal classes"}, "schema_parser.py": {"briefDesc": "Internal utilities to parse Spark SQL schema information", "grouping": "internal classes"}, "spark_singleton.py": {"briefDesc": "Spark singleton for test purposes", diff --git a/tests/test_value_based_PRNG.py b/tests/test_value_based_PRNG.py new file mode 100644 index 00000000..164e0b8e --- /dev/null +++ b/tests/test_value_based_PRNG.py @@ -0,0 +1,113 @@ +import re +import pytest +import pandas as pd +import numpy as np + +import pyspark.sql.functions as F + +import pyspark.sql.functions as F + +from pyspark.sql.types import BooleanType, DateType +from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType + +import dbldatagen as dg +from dbldatagen import TemplateGenerator, TextGenerator + +# 
add the following if using pandas udfs +# .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1000") \ + + +spark = dg.SparkSingleton.getLocalInstance("unit tests") + + +# Test manipulation and generation of test data for a large schema +class TestValueBasedPRNG: + + def test_basics(self): + arr = np.arange(100) + + rng = dg.ValueBasedPRNG(arr, shape=(len(arr), 10) ) + + assert rng is not None + + assert rng.shape == (len(arr), 10) + + assert rng.seedValues is not None and np.array_equal(rng.seedValues, arr) + + + @pytest.mark.parametrize("data, shape, bounds, endpoint", + [ + (np.arange(1024), (1024, 10), 255, False), + (np.arange(1024), (1024, 15), np.arange(15) * 3 + 1, False), + (np.arange(1024), (1024, ), 255, False), + (np.arange(1024), (1024, 10), 255, False), + (np.arange(1024), (1024, 15), np.arange(15) * 3 + 1, False), + (1, (1, 5), [1,2,3,4,5], False), + (10, None, [1, 2, 3, 4, 5], False), + ]) + def test_integers(self, data, shape, bounds, endpoint): + + arr = np.array(data) + rng = dg.ValueBasedPRNG(arr, shape=shape) + + bounds_arr = np.full(shape, bounds) + + results = rng.integers(bounds_arr, endpoint=endpoint) + + assert results is not None , "results should be not None" + assert results.shape == bounds_arr.shape, "results should be of target shape" + + # make sure that we have results that are always simply the passed in bounds + assert not np.array_equal(results, bounds_arr) + + @pytest.mark.parametrize("data, shape", + [ + (23, (1024, 10) ), + (23.10, (1024, 15) ), + (23, None), + (23.10, None), + ]) + def test_initialization_success(self, data, shape): + + arr = np.array(data) + rng = dg.ValueBasedPRNG(arr, shape=shape) + + assert rng is not None + + @pytest.mark.parametrize("data, shape", + [ + ("test", (1024, )), + ("test", None), + ]) + def test_initialization_fail_value(self, data, shape): + + with pytest.raises(ValueError): + arr = np.array(data) + rng = dg.ValueBasedPRNG(arr, shape=shape) + + assert rng is not None + + + def 
test_udfs(self): + + def exampleUdf(v): + v1 = pd.DataFrame(v) + mk_str_fn = lambda x: str(hash(tuple(x))) #str(x) + results = v1.apply(mk_str_fn, axis=1) + return pd.Series(results) + + testUdf = F.pandas_udf(exampleUdf, returnType=StringType()).asNondeterministic() + + df = (spark.range(1024) + .withColumn("v1", F.expr("id * id")) + .withColumn("v2", F.expr("id * id")) + .withColumn("v3", F.expr("id * id")) + .withColumn("v4", F.expr("'test'")) + .withColumn("v5", testUdf(F.array(F.col("v3"), F.col("v4")))) + .withColumn("v6", testUdf(F.col("v3"))) + ) + + df.show() + + + From 6e9e734b982d8f4dd668350bc55c7e40559b9311 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 6 Apr 2023 03:36:15 -0700 Subject: [PATCH 6/8] wip --- dbldatagen/text_generatestring.py | 11 +- dbldatagen/text_generators.py | 1 - dbldatagen/value_based_prng.py | 246 +++++++++++++++++++++++++----- tests/test_text_generatestring.py | 5 +- tests/test_value_based_PRNG.py | 70 ++++++--- 5 files changed, 269 insertions(+), 64 deletions(-) diff --git a/dbldatagen/text_generatestring.py b/dbldatagen/text_generatestring.py index 82ad6e44..fdb561bd 100644 --- a/dbldatagen/text_generatestring.py +++ b/dbldatagen/text_generatestring.py @@ -12,10 +12,11 @@ import numpy as np import pandas as pd +import pyspark.sql.functions as F + from .text_generators import TextGenerator from .text_generators import _DIGITS_ZERO, _LETTERS_UPPER, _LETTERS_LOWER, _LETTERS_ALL -import pyspark.sql.functions as F class GenerateString(TextGenerator): # lgtm [py/missing-equals] """This class handles the generation of string text of specified length drawn from alphanumeric characters. 
@@ -57,8 +58,8 @@ class GenerateString(TextGenerator): # lgtm [py/missing-equals] def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, allAlpha=False, customChars=None): super().__init__() - assert not customChars or isinstance(customChars, list) or isinstance(customChars, str), \ - "`customChars` should be list of characters or string containing custom chars" + assert not customChars or isinstance(customChars, (list, str)), \ + "`customChars` should be list of characters or string containing custom chars" assert not allUpper or not allLower, "allUpper and allLower cannot both be True" @@ -129,7 +130,7 @@ def make_variable_length_mask(self, v, lengths): return (c_ix.T < lengths.T).T def mk_bounds(self, v, minLength, maxLength): - rng = default_rng(42) + rng = np.random.default_rng(42) v_bounds = np.full(v.shape[0], (maxLength - minLength) + 1) return rng.integers(v_bounds) + minLength @@ -163,7 +164,7 @@ def pandasGenerateText(self, v): placeholders = np.full((v.shape[0], self._maxLength), '', dtype=np.object_) - lengths = (v.to_numpy() % (self._maxLength - self._minLength) + self._minLength) + lengths = v.to_numpy() % (self._maxLength - self._minLength) + self._minLength v1 = np.full((v.shape[0], self._maxLength), -1) diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index ae10644d..d0fd435f 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -172,7 +172,6 @@ def prepareBaseValue(self, baseDef): return baseDef - class TemplateGenerator(TextGenerator): # lgtm [py/missing-equals] """This class handles the generation of text from templates diff --git a/dbldatagen/value_based_prng.py b/dbldatagen/value_based_prng.py index c2e2293a..18039348 100644 --- a/dbldatagen/value_based_prng.py +++ b/dbldatagen/value_based_prng.py @@ -5,49 +5,79 @@ """ This module defines the ValueBasedPRNG class -The Value Based PRNG (psuedo random number generator) uses the PCG algorithm to generate repeatable 
-psuedo random numbers. It achieves excellent statistical performance with small and fast code, and small state size.
+The Value Based PRNG (pseudo random number generator) uses variations of the PCG algorithm to generate repeatable
+pseudo random numbers.
 
-More details of the PCG algorithnm can be found here.
+PCG operates by using classic LCG (Linear Congruential Generator) approaches to generate the random state
+but applies a transform to generate the final pseudo random numbers.
 
-https://www.pcg-random.org/index.html
+More details of the LCG and PCG algorithms can be found here.
+ - https://en.wikipedia.org/wiki/Linear_congruential_generator
+ - https://en.wikipedia.org/wiki/Permuted_congruential_generator
 
-The implementation used here, uses as a numpy array of values to seed different random seeds for each row
-so that generating pseudo random sequences across each row of a numpy 2d array are completely repeatable.
+
+The implementation uses a numpy array of values to seed different random seeds for each row
+so that generating pseudo random sequences across each row of a numpy 2d array are completely repeatable and the
+sequences for a given row will depend on the value of the seed.
+
+The goal here is to seed each stream with a seed based on the value passed in for a given row
+so that two rows with the same seed value will produce the same sequence.
 
 This differs from existing Numpy implementations as in existing Numpy random number generators, each row does
 not have a separate random seed.
 
-The goal of this implementation is to ensure that the sequence of psuedo-random numbers generated for a given row
-are independent of the value of the seed in other rows.
-
 In practice, this makes it ideal for generating repeatable foreign keys for a given primary key.
 
 """
+import math
 import numpy as np
 
+
 class ValueBasedPRNG(object):
     """ Value based psuedo-random number generator designed to generate repeatable random sequences.
-    It uses the supplied array `v` to seed each a set of states for generating random numbers in sequences of shape
-    where the first dimension is a multiple of the size of the seed array `v`.
+    It uses the supplied array `values` to seed a set of states for generating random numbers in sequences
+    of shape where the first dimension is a multiple of the size of the seed array `values`.
 
     The overall goal is to provide a prng that is replacement for the numpy random number generator where only
     the `integers` method is used.
 
-    Note that by array-like, we mean a list, a numpy array or a numeric value
+    While internal state is of type `numpy.uint64`, emitted values are of type `numpy.uint32` by default.
+
+    Note that by array-like, we mean a list, a numpy array or a numeric scalar value
     """
 
-    _MASK32 = 2 ** 32 - 1  # uint32_t
-    _MASK64 = 2 ** 64 - 1  # uint64_t
+    MASK_32 = 2 ** 32 - 1
+    MASK_64 = 2 ** 64 - 1
+    MULTIPLIER = 0x5851f42d4c957f2d
+    INITIAL_STATE = 0x4d595df4d0f33173
+    INCREMENT = 0x14057b7ef767814f
 
-    def __init__(self, v, shape=None, dtype=None):
-        """
+    ROWS = 20000
+    COLUMNS = 50
+    BITS_FOR_COLUMNS = int(math.ceil(math.log(COLUMNS, 2)))
+
+    MULTIPLIER = 0x5DEECE66D
+    INITIAL_STATE = 0
+    INCREMENT = 11
+
+    # MULTIPLIER = 0x5851f42d4c957f2d
+    # INITIAL_STATE = 0x4d595df4d0f33173
+    # INCREMENT = 0x14057b7ef767814f
+
+    MAX_CYCLES = 25
 
-        :param v: array-like list of values to seed the number generator
-        :param shape: If supplied, indicates that generation should allow for generations of values of supplied shape
-        :param dtype: Numpy dtype for generation of random numbers
+    def __init__(self, values, shape=None, additionalSeed=None):
+        """ Initialize the PRNG
+
+        :param values: array-like list of values to seed the number generator. If values has second or additional
+            dimension, seedValues are hash of each row.
Must be scalar, 1d or 2d array-like value + :param shape: If supplied, indicates that generation should optimize for generations of values of supplied + shape. + If supplied, must be 1-d or 2-d shape and first dimension must match first dimension of values + + param additionalSeed: if provided, the additional seed will be combined with base pcg seed. Must be of type int If a shape is supplied, it creates sufficient state to allow for generation of the number of values indicated by the shape parameter independently. In effect, each row has a separate random generator and each column will @@ -58,35 +88,181 @@ def __init__(self, v, shape=None, dtype=None): If converting from string values, it expects that strings can be converted to ints, and if not will raise a ValueError. """ - effective_values = np.array(v) # if v is already numpy array, its a no-op - - if effective_values.dtype not in [ np.int32, np.uint32, np.int64, np.uint64]: - if np.can_cast(effective_values, np.int64, casting="unsafe"): - effective_values = effective_values.astype(np.int64, casting="unsafe") + assert values is not None, "`values` must be supplied" + + self._additionalSeed = additionalSeed + + effective_values = np.array(values) # if v is already numpy array, its a no-op + supplied_shape = effective_values.shape + + if len(supplied_shape) > 2: + raise ValueError("`values` must be scalar, 1d or 2d array-like value") + + if shape is not None and not isinstance(shape, tuple): + raise ValueError("`shape` must be tuple, if supplied") + + self._output_shape = shape or effective_values.shape + + print("shape and type", effective_values.shape, effective_values.dtype) + # reshape as needed + effective_values = self._reshapeAtLeast2d(effective_values) + + # apply hash if needed + if len(supplied_shape) > 2 or (len(supplied_shape) == 2 and supplied_shape[1] > 1): + # use hash of each row as seed + print("hashing") + hashing_shape = supplied_shape + while len(hashing_shape) > 2 or (len(hashing_shape) == 2 
and hashing_shape[1] > 1): + effective_values = self._hashSeedValues(effective_values) + hashing_shape = effective_values.shape + elif np.issubdtype(effective_values.dtype, np.str_): + print("converting strings") + effective_values = self._hashSeedValues(effective_values) + elif np.issubdtype(effective_values.dtype, np.object_): + print("hashing objects") + effective_values = self._hashSeedValues(effective_values) + + # cast if needed to numpy.uint64 + effective_values = self._convertTypeIfNecessary(effective_values) + effective_values = self._reshapeAtLeast2d(effective_values) + + self._seed_values = effective_values + + # compute initial state + columns = self._columns_from_shape(effective_values.shape) + bits_for_columns = int(math.ceil(math.log(columns, 2))) + rows = self._rows_from_shape(effective_values.shape) + + column_increments = ((effective_values << bits_for_columns) | + np.arange(1, columns + 1, dtype=np.uint64)) << 1 | 1 + + self._state = np.full((rows, columns), column_increments + self.INITIAL_STATE, dtype=np.uint64) + self._incr = np.full((rows, columns), column_increments, dtype=np.uint64) + + def _reshapeAtLeast2d(self, arr): + """ Reshape array as 2d""" + if arr.shape == (): + return arr.reshape((1, 1)) + elif len(arr.shape) == 1: + return arr.reshape((arr.shape[0], 1)) + else: + return arr + + def _hashSeedValues(self, values): + """ Hash and reshape 2d values """ + print("hashing values", values) + values_shape = values.shape + results = np.apply_along_axis(lambda r: hash(tuple(r)), 1, values).astype(np.uint64) + + return results + + def _convertTypeIfNecessary(self, values): + """ Convert values array""" + results = values + # cast if needed to numpy.uint64 + if values.dtype != np.uint64: + if np.can_cast(values, np.uint64, casting="unsafe"): + results = values.astype(np.uint64, casting="unsafe") else: raise TypeError("Cant cast values to np.int64") + return results + + @staticmethod + def _columns_from_shape(shape): + assert 
isinstance(shape, tuple), "expecting tuple for shape"
 
-        self._v = effective_values
-        self._dtype = dtype or effective_values.dtype
+        if len(shape) < 2:
+            return 1
+        return shape[1]
 
-        # scalars have a numpy shape of ()
-        self._shape = shape or effective_values.shape
-        self._internal_shape = self._shape
+    @staticmethod
+    def _rows_from_shape(shape):
+        assert isinstance(shape, tuple), "expecting tuple for shape"
 
-        # internally treat scalars as array of size 1
-        if self._v.shape == ():
-            self._internal_shape = (1,)
+        return shape[0]
+
+    def _init_state(self, values, shape):
+        pass
 
     @property
     def shape(self):
         """get the `shape` attribute"""
-        return self._shape
+        return self._output_shape
 
     @property
     def seedValues(self):
         """ Get the values that were used to seed the PRNG"""
-        return self._v
-
+        return self._seed_values
+
+    def random_r(self, state, incr):
+        # compute state change using classic LCG algorithm
+        x = state[...]
+        state[...] = (x * self.MULTIPLIER + incr) & self.MASK_64
+
+        # then use transformation on state to generate actual random numbers
+        # 32 bit variation
+        # xorstate = (((x >> 18) ^ x) >> 27) & MASK_32
+        # count = (x >> 59) & MASK_32
+        # results = ((xorstate >> count) | (xorstate << (-count & 31))) & MASK_32
+
+        count = (x >> 59) & self.MASK_32
+        results = x >> (16 + count) & self.MASK_32
+        return results
+
+    def bounded_random(self, bound, state, incr):
+        """ Use rejection sampling to remove modulo bias
+
+        :param bound: upper bound (exclusive) for the generated random values
+        :param state: random state
+        :param incr: random increment
+        :return: array of bounded pseudo-random values
+
+        Modulo bias occurs when a set of numbers to be used in a modulus calculation is not an even
+        multiple of the divisor.
+
+        For example: for the set of numbers 0 .. 67, `n` mod 16 will return more numbers in the range 1 .. 3
+
+        So when using classic LCG random number generation `(aX + c) mod m`, the results will not be uniformly
+        distributed unless you discard numbers beyond a threshold of the highest before applying the modulo arithmetic.
+ + This threshold can be calculated using the expresson ``(-bound & MASK_32) % bound`` for a 32 bit random number. + + While the loop to find numbers over the threshold is usually short, we add a limit to the number of attempts + to retry the random number generation. This will produce slightly non-uniform distribution in rare cases + + As the intent for this random number generator to compute random word and character offsets and other uses + where the bounds are relatively low, this is acceptable for our use case. + + See: + "Efficiently Generating a Random Number in a Range" by Dr M.E O'Neill + https://www.pcg-random.org/posts/bounded-rands.html + + """ + + threshold = (-bound & self.MASK_32) % bound + + r = self.random_r(state, incr) + assert r is not None + + mask = r >= threshold + r[mask] = r[mask] % bound + print(np.all(mask)) + + cycles = 0 + + while not np.all(mask) and cycles < self.MAX_CYCLES: + r1 = self.random_r(state, incr) + mask = r1 >= threshold + r[mask] = r1[mask] % bound + cycles = cycles + 1 + print("cycle", cycles) + + final_mask = r >= bound + if np.any(final_mask): + print("fixup") + r[final_mask] = r[final_mask] % bound + + return r def integers(self, low, high=None, size=None, dtype=None, endpoint=False): """ Return psuedo-random integers from low (inclusive) to high (exclusive), or if endpoint=True, diff --git a/tests/test_text_generatestring.py b/tests/test_text_generatestring.py index 8db08b5c..9d6fc599 100644 --- a/tests/test_text_generatestring.py +++ b/tests/test_text_generatestring.py @@ -78,7 +78,7 @@ def test_basics(self, length, leadingAlpha, allUpper, allLower, allAlpha, custom dg.GenerateString((10, 20), allLower=True), dg.GenerateString((1, 10)), dg.GenerateString((3, 15)), - dg.GenerateString((17,22)), + dg.GenerateString((17, 22)), dg.GenerateString((1, 10)), ]) def test_simple_data(self, genstr): @@ -89,7 +89,7 @@ def test_simple_data(self, genstr): .withColumn("code2", IntegerType(), min=0, max=10) .withColumn("code3", 
StringType(), values=['a', 'b', 'c']) .withColumn("code4", StringType(), values=['a', 'b', 'c'], random=True) - .withColumn("code5", StringType(), text=dg.GenerateString( (1, 10) )) + .withColumn("code5", StringType(), text=dg.GenerateString((1, 10))) ) fieldsFromGenerator = set(dgspec.getOutputColumnNames()) @@ -97,4 +97,3 @@ def test_simple_data(self, genstr): df_testdata = dgspec.build() df_testdata.show() - diff --git a/tests/test_value_based_PRNG.py b/tests/test_value_based_PRNG.py index 164e0b8e..b314943a 100644 --- a/tests/test_value_based_PRNG.py +++ b/tests/test_value_based_PRNG.py @@ -5,13 +5,9 @@ import pyspark.sql.functions as F -import pyspark.sql.functions as F - -from pyspark.sql.types import BooleanType, DateType from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType import dbldatagen as dg -from dbldatagen import TemplateGenerator, TextGenerator # add the following if using pandas udfs # .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1000") \ @@ -26,23 +22,26 @@ class TestValueBasedPRNG: def test_basics(self): arr = np.arange(100) - rng = dg.ValueBasedPRNG(arr, shape=(len(arr), 10) ) + rng = dg.ValueBasedPRNG(arr, shape=(len(arr), 10)) assert rng is not None assert rng.shape == (len(arr), 10) - assert rng.seedValues is not None and np.array_equal(rng.seedValues, arr) + print(rng.seedValues) + assert rng.seedValues is not None + print(rng.seedValues.shape, rng.seedValues.dtype) + assert np.array_equal(rng.seedValues.reshape(arr.shape), arr) @pytest.mark.parametrize("data, shape, bounds, endpoint", [ (np.arange(1024), (1024, 10), 255, False), (np.arange(1024), (1024, 15), np.arange(15) * 3 + 1, False), - (np.arange(1024), (1024, ), 255, False), + (np.arange(1024), (1024,), 255, False), (np.arange(1024), (1024, 10), 255, False), (np.arange(1024), (1024, 15), np.arange(15) * 3 + 1, False), - (1, (1, 5), [1,2,3,4,5], False), + (1, (1, 5), [1, 2, 3, 4, 5], False), (10, None, [1, 2, 3, 4, 5], False), ]) def 
test_integers(self, data, shape, bounds, endpoint): @@ -52,20 +51,37 @@ def test_integers(self, data, shape, bounds, endpoint): bounds_arr = np.full(shape, bounds) - results = rng.integers(bounds_arr, endpoint=endpoint) + # make sure that we have results that are always simply the passed in bounds + # they can be occasionally equal but not always equal + + cumulative_equality = bounds_arr >= 0 # will be array of boolean values all true - assert results is not None , "results should be not None" - assert results.shape == bounds_arr.shape, "results should be of target shape" + for ix in range(10): + results = rng.integers(bounds_arr, endpoint=endpoint) + + assert results is not None, "results should be not None" + assert results.shape == bounds_arr.shape, "results should be of target shape" + + results_equality = results == bounds_arr + cumulative_equality = cumulative_equality and results_equality + print(cumulative_equality) - # make sure that we have results that are always simply the passed in bounds assert not np.array_equal(results, bounds_arr) @pytest.mark.parametrize("data, shape", [ - (23, (1024, 10) ), - (23.10, (1024, 15) ), + (23, (1024, 10)), + (23.10, (1024, 15)), (23, None), (23.10, None), + ([[1, 2, 3], [1, 2, 3]], None), + ([[1, "two", 3], [1, 2, "three"]], None), + ("test", None), + (["test", "test2"], None), + ([True, False, True], None), + ((1, 2, 3), None), + (np.datetime64('2005-10-24'), None) + ]) def test_initialization_success(self, data, shape): @@ -74,10 +90,15 @@ def test_initialization_success(self, data, shape): assert rng is not None + print(rng.seedValues.shape, rng.seedValues.dtype, rng.seedValues) + @pytest.mark.parametrize("data, shape", [ - ("test", (1024, )), - ("test", None), + ({'a': 1, 'b': 2}, 25), # bad shape + ( 34, 45), # bad shape + ([[[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]]], None), # bad value - dimensions ]) def test_initialization_fail_value(self, data, shape): @@ -87,12 +108,24 @@ def 
test_initialization_fail_value(self, data, shape): assert rng is not None + @pytest.mark.parametrize("data, shape", + [ + ({'a': 1, 'b': 2}, (1024,)), + ]) + def test_initialization_fail_type(self, data, shape): + + with pytest.raises(TypeError): + arr = np.array(data) + rng = dg.ValueBasedPRNG(arr, shape=shape) + assert rng is not None + + @pytest.mark.skip(reason="no way of currently testing this") def test_udfs(self): def exampleUdf(v): v1 = pd.DataFrame(v) - mk_str_fn = lambda x: str(hash(tuple(x))) #str(x) + mk_str_fn = lambda x: str(hash(tuple(x))) # str(x) results = v1.apply(mk_str_fn, axis=1) return pd.Series(results) @@ -108,6 +141,3 @@ def exampleUdf(v): ) df.show() - - - From ef916a3746c916698f01c49c86ecf5ba97161fcf Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 6 Apr 2023 04:15:20 -0700 Subject: [PATCH 7/8] wip --- tests/test_value_based_PRNG.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_value_based_PRNG.py b/tests/test_value_based_PRNG.py index b314943a..d34ba84b 100644 --- a/tests/test_value_based_PRNG.py +++ b/tests/test_value_based_PRNG.py @@ -34,6 +34,7 @@ def test_basics(self): print(rng.seedValues.shape, rng.seedValues.dtype) assert np.array_equal(rng.seedValues.reshape(arr.shape), arr) + @pytest.mark.skip(reason="work in progress") @pytest.mark.parametrize("data, shape, bounds, endpoint", [ (np.arange(1024), (1024, 10), 255, False), @@ -120,7 +121,7 @@ def test_initialization_fail_type(self, data, shape): assert rng is not None - @pytest.mark.skip(reason="no way of currently testing this") + @pytest.mark.skip(reason="work in progress") def test_udfs(self): def exampleUdf(v): From 51df13b935540222aa789b7eab65b799be12cf4d Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 6 Apr 2023 11:02:05 -0700 Subject: [PATCH 8/8] wip --- dbldatagen/value_based_prng.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbldatagen/value_based_prng.py b/dbldatagen/value_based_prng.py index 
18039348..a7b65e86 100644 --- a/dbldatagen/value_based_prng.py +++ b/dbldatagen/value_based_prng.py @@ -50,6 +50,8 @@ class ValueBasedPRNG(object): MASK_32 = 2 ** 32 - 1 MASK_64 = 2 ** 64 - 1 + + # constants for PCG MULTIPLIER = 0x5851f42d4c957f2d INITIAL_STATE = 0x4d595df4d0f33173 INCREMENT = 0x14057b7ef767814f @@ -58,6 +60,7 @@ class ValueBasedPRNG(object): COLUMNS = 50 BITS_FOR_COLUMNS = int(math.ceil(math.log(COLUMNS, 2))) + # constants for Java style LCG MULTIPLIER = 0x5DEECE66D INITIAL_STATE = 0 INCREMENT = 11 @@ -181,9 +184,6 @@ def _rows_from_shape(shape): return shape[0] - def _init_state(self, values, shape): - pass - @property def shape(self): """get the `shape` attribute"""