Refactor vocab pipeline to use MLTransform ComputeAndApplyVocabulary

aIbrahiim · aIbrahiim · commit 66cfd4ce0d49 · 2026-04-30T20:26:16.000+03:00
diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_MLTransform_Generate_Vocab_Batch.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_MLTransform_Generate_Vocab_Batch.txt
@@ -26,6 +26,7 @@
 --autoscaling_algorithm=THROUGHPUT_BASED
 --worker_zone=us-central1-b
 --sdk_location=container
+--requirements_file=apache_beam/ml/transforms/mltransform_tests_requirements.txt
 --input_options={}
 --publish_to_big_query=true
 --metrics_dataset=beam_run_inference
diff --git a/sdks/python/apache_beam/ml/transforms/mltransform_tests_requirements.txt b/sdks/python/apache_beam/ml/transforms/mltransform_tests_requirements.txt
@@ -14,8 +14,17 @@
 # limitations under the License.
 #
 
-# Keep this benchmark requirements minimal. The vocab benchmark implementation
-# does not depend on TensorFlow/TensorFlow Transform, and those packages can
-# force incompatible apache-beam constraints during CI resolution.
+# Keep these benchmark requirements focused and deterministic for Dataflow
+# workers. MLTransform TFT operations require a consistent TensorFlow Transform
+# dependency set; otherwise workers can crash-loop with pandas/numpy ABI
+# mismatches during SDK harness startup.
 google-cloud-monitoring>=2.27.0
+tensorflow_transform>=1.14.0,<1.15.0
+tensorflow-metadata>=1.14.0,<1.15.0
+tfx-bsl>=1.14.0,<1.15.0
+# tfx-bsl / tensorflow-transform rely on pandas 1.x with numpy 1.x.
+numpy<2
+pandas<2
+# tensorflow-transform expects dill but does not hard-pin it.
+dill
 
diff --git a/sdks/python/apache_beam/testing/benchmarks/inference/mltransform_generate_vocab_benchmark.py b/sdks/python/apache_beam/testing/benchmarks/inference/mltransform_generate_vocab_benchmark.py
@@ -39,6 +39,7 @@ def test(self):
     extra_opts = {
         'input_file': self.pipeline.get_option('input_file'),
         'output_vocab': self.pipeline.get_option('output_vocab'),
+        'artifact_location': self.pipeline.get_option('artifact_location'),
         'columns': self.pipeline.get_option('columns'),
         'vocab_size': self.pipeline.get_option('vocab_size'),
         'min_frequency': self.pipeline.get_option('min_frequency'),