Skip to content

Commit 4a0b8cb

Browse files
Merge pull request #3535 from AI-Hypercomputer:aireen/tiktoken
PiperOrigin-RevId: 892656253
2 parents f2216e2 + b47abba commit 4a0b8cb

4 files changed

Lines changed: 41 additions & 10 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -597,8 +597,7 @@ num_vocab_tiling: 1
597597
# Tokenizer
598598
vocab_size: 32_000 # powers of 2 for sharding
599599
tokenizer_path: ""
600-
# tfds pipeline supports tokenizer_type: sentencepiece, huggingface, tiktoken
601-
# grain pipeline supports tokenizer_type: sentencepiece, huggingface
600+
# grain and tfds pipelines support tokenizer_type: sentencepiece, huggingface, tiktoken
602601
# hf pipeline only supports huggingface type, and will ignore tokenizer_type flag
603602
tokenizer_type: "sentencepiece" # Currently supporting: "tiktoken", "sentencepiece", "huggingface"
604603
use_chat_template: False

src/maxtext/configs/types.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2580,13 +2580,6 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
25802580
raise ValueError("When dataset_type=grain, please set grain_train_files or grain_train_mixture_config_path")
25812581
if self.eval_interval > 0 and not self.grain_eval_files:
25822582
raise ValueError("Please specify grain_eval_files or set eval_interval to <=0.")
2583-
if self.tokenizer_type not in (
2584-
TokenizerType.SENTENCEPIECE,
2585-
TokenizerType.HUGGINGFACE,
2586-
):
2587-
raise ValueError(
2588-
f"grain pipeline only supports tokenizer_type: sentencepiece, huggingface, but got {self.tokenizer_type}"
2589-
)
25902583
elif self.dataset_type == DatasetType.TFDS:
25912584
if not self.dataset_name:
25922585
raise ValueError("dataset_name can't be empty when dataset_type=tfds")

src/maxtext/input_pipeline/grain_tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class TokenizerTransformBase:
3030
# pylint: disable=attribute-defined-outside-init
3131
feature_names: str | Sequence[str]
3232
sequence_length: int | Sequence[int]
33-
tokenizer: tokenizer.SentencePieceTokenizer | tokenizer.HFTokenizer
33+
tokenizer: tokenizer.SentencePieceTokenizer | tokenizer.HFTokenizer | tokenizer.TikTokenTokenizer
3434

3535
def __post_init__(self):
3636
self._processor = None

tests/unit/grain_data_processing_test.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,45 @@ def test_for_loop_repeatable(self):
251251
super().test_for_loop_repeatable()
252252

253253

254+
class GrainArrayRecordTiktokenTest(GrainArrayRecordProcessingTest):
255+
"""Test grain data processing with best_fit packing strategy."""
256+
257+
def setUp(self):
258+
super().setUp()
259+
self.config = self._make_config(
260+
tokenizer_type="tiktoken",
261+
tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizers", "tokenizer_llama3.tiktoken"),
262+
)
263+
self.train_iter = grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
264+
265+
# Only runs test_train_ds from parent class, skip other tests
266+
@pytest.mark.skip(reason="skip for tokenizer testing")
267+
def test_batch_determinism(self):
268+
pass
269+
270+
@pytest.mark.skip(reason="skip for tokenizer testing")
271+
def test_for_loop_repeatable(self):
272+
pass
273+
274+
275+
class GrainArrayRecordHFTokenizerTest(GrainArrayRecordProcessingTest):
276+
"""Test grain data processing with best_fit packing strategy."""
277+
278+
def setUp(self):
279+
super().setUp()
280+
self.config = self._make_config(tokenizer_type="huggingface", tokenizer_path="deepseek-ai/DeepSeek-V3")
281+
self.train_iter = grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
282+
283+
# Only runs test_train_ds from parent class, skip other tests
284+
@pytest.mark.skip(reason="skip for tokenizer testing")
285+
def test_batch_determinism(self):
286+
pass
287+
288+
@pytest.mark.skip(reason="skip for tokenizer testing")
289+
def test_for_loop_repeatable(self):
290+
pass
291+
292+
254293
class GrainArrayRecordBestFitPackingTest(GrainArrayRecordProcessingTest):
255294
"""Test grain data processing with best_fit packing strategy."""
256295

0 commit comments

Comments
 (0)