@@ -109,6 +109,31 @@ def _validate(self) -> None:
109
109
super ()._validate ()
110
110
Assert .in_range (self .rank , 0 , self .world_size )
111
111
112
@config_class()
class FieldCombinePreparatorConfig(Config):
    """
    Configuration for combining several dataset fields into one new field.

    Going by the field descriptions, the values of `fields` are joined with
    `delimiter` and stored under `new_field_name`. The combining itself is not
    implemented here; presumably the dataset preparator consumes this config.
    """

    fields: list[str] = Field(
        # Use a factory rather than a `[]` literal: a list literal is a mutable
        # default, which dataclass-style fields reject (or share across instances).
        default_factory=list,
        desc="Fields of the dataset to combine.",
        hint=FieldHint.core,
    )
    delimiter: str = Field(
        default=" ",
        desc="Delimiter to use when combining fields.",
        hint=FieldHint.optional,
    )
    new_field_name: str = Field(
        default="fast_llm_combined_field",
        desc="Name of the new field to create.",
        hint=FieldHint.optional,
    )

    def _validate(self) -> None:
        """Check that some fields are listed and the delimiter is a string, then run the parent validation."""
        # The default (empty) list is expected to fail here: there must be
        # something to combine.
        Assert.gt(len(self.fields), 0)
        assert isinstance(self.delimiter, str), "Delimiter must be a string."
        super()._validate()
112
137
113
138
@config_class ()
114
139
class GPTMemmapDatasetPreparatorConfig (DatasetPreparatorConfig ):
@@ -164,6 +189,11 @@ class GPTMemmapDatasetPreparatorConfig(DatasetPreparatorConfig):
164
189
" Does not shuffle samples." ,
165
190
hint = FieldHint .optional ,
166
191
)
192
# Optional settings for combining several dataset fields into one new field.
# NOTE(review): the default is None although the annotation is not optional —
# confirm the config framework accepts this and that callers check for None.
combine_fields: FieldCombinePreparatorConfig = Field(
    default=None,
    desc="Combine multiple dataset fields into a single new field.",
    hint=FieldHint.optional,
)
167
197
168
198
def _validate (self ) -> None :
169
199
assert self .tokenizer .path is not None
0 commit comments