apache
diff --git a/‎.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml‎
Lines changed: 12 additions & 0 deletions b/‎.github/workflows/beam_Inference_Python_Benchmarks_Dataflow.yml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_MLTransform_Generate_Vocab_Batch.txt‎
Lines changed: 44 additions & 0 deletions b/‎.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_MLTransform_Generate_Vocab_Batch.txt‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎.test-infra/tools/refresh_looker_metrics.py‎
Lines changed: 2 additions & 1 deletion b/‎.test-infra/tools/refresh_looker_metrics.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎sdks/python/apache_beam/examples/ml_transform/README.md‎
Lines changed: 130 additions & 0 deletions b/‎sdks/python/apache_beam/examples/ml_transform/README.md‎
Lines changed: 130 additions & 0 deletions
@@ -94,6 +94,7 @@ jobs:
             ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
             ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Table_Row_Inference_Batch.txt
             ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_Table_Row_Inference_Stream.txt
+            ${{ github.workspace }}/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_MLTransform_Generate_Vocab_Batch.txt
       # The env variables are created and populated in the test-arguments-action as "<github.job>_test_arguments_<argument_file_paths_index>"
       - name: get current time
         run: echo "NOW_UTC=$(date '+%m%d%H%M%S' --utc)" >> $GITHUB_ENV
@@ -214,3 +215,14 @@ jobs:
             -PpythonVersion=3.10 \
             -PloadTest.requirementsTxtFile=apache_beam/ml/inference/table_row_inference_requirements.txt \
             '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_10 }} --autoscaling_algorithm=THROUGHPUT_BASED --max_num_workers=20 --metrics_table=result_table_row_inference_stream --influx_measurement=result_table_row_inference_stream --mode=streaming --input_subscription=projects/apache-beam-testing/subscriptions/table_row_inference_benchmark --window_size_sec=60 --trigger_interval_sec=30 --timeout_ms=900000 --output_table=apache-beam-testing:beam_run_inference.result_table_row_inference_stream_outputs --job_name=benchmark-tests-table-row-inference-stream-${{env.NOW_UTC}}'
+      - name: run MLTransform Generate Vocab Batch
+        uses: ./.github/actions/gradle-command-self-hosted-action
+        timeout-minutes: 180
+        with:
+          gradle-command: :sdks:python:apache_beam:testing:load_tests:run
+          arguments: |
+            -PloadTest.mainClass=apache_beam.testing.benchmarks.inference.mltransform_generate_vocab_benchmark \
+            -Prunner=DataflowRunner \
+            -PpythonVersion=3.10 \
+            -PloadTest.requirementsTxtFile=apache_beam/ml/transforms/mltransform_tests_requirements.txt \
+            '-PloadTest.args=${{ env.beam_Inference_Python_Benchmarks_Dataflow_test_arguments_11 }} --job_name=benchmark-tests-mltransform-generate-vocab-batch-${{env.NOW_UTC}}'
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+--project=apache-beam-testing
+--region=us-central1
+--runner=DataflowRunner
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--staging_location=gs://temp-storage-for-perf-tests/loadtests
+--machine_type=n1-standard-4
+--disk_size_gb=100
+--num_workers=8
+--max_num_workers=16
+--autoscaling_algorithm=THROUGHPUT_BASED
+--worker_zone=us-central1-b
+--sdk_location=container
+--requirements_file=apache_beam/ml/transforms/mltransform_tests_requirements.txt
+--input_options={}
+--publish_to_big_query=true
+--metrics_dataset=beam_run_inference
+--metrics_table=mltransform_generate_vocab_batch
+--influx_measurement=mltransform_generate_vocab_batch
+--input_file=gs://apache-beam-ml/testing/inputs/sentences_50k.txt
+--output_vocab=gs://temp-storage-for-perf-tests/mltransform/vocab_outputs/mltransform_generate_vocab_batch
+--columns=text
+--vocab_size=50000
+--min_frequency=1
+--lowercase=true
+--tokenizer=whitespace
+--oov_token=<UNK>
+--input_expand_factor=1
+
@@ -44,7 +44,8 @@
     ("85", ["268", "269", "270", "271", "272"]),  # PyTorch Sentiment Batch DistilBERT base uncased
     ("86", ["284", "285", "286", "287", "288"]),  # VLLM Batch Gemma
     ("96", ["270", "304", "305", "353", "354"]),   # Table Row Inference Sklearn Batch
-    ("106", ["355", "356", "357", "358", "359"])   # Table Row Inference Sklearn Streaming
+    ("106", ["355", "356", "357", "358", "359"]),   # Table Row Inference Sklearn Streaming
+    ("107", ["360", "361", "362", "363", "364"]),  # MLTransform Generate Vocab Batch
 ]
 
 def get_look(id: str) -> models.Look:
 
@@ -0,0 +1,130 @@
+<!--
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# MLTransform Examples
+
+This directory contains Apache Beam examples for MLTransform pipelines.
+
+## MLTransform - Generate Vocab (Batch only)
+
+`mltransform_generate_vocab.py` builds a vocabulary artifact from batch input
+rows using `MLTransform` + `ComputeAndApplyVocabulary`.
+
+### What it does
+
+1. Reads input rows from JSONL (`--input_file`) or BigQuery (`--input_table`).
+2. Extracts specified columns (`--columns`).
+3. Normalizes text (`trim`, optional lowercasing).
+4. Tokenizes text (`whitespace` or `regex` tokenizer).
+5. Runs `ComputeAndApplyVocabulary` with top-k and min-frequency constraints.
+6. Ensures the `--oov_token` value is the first entry of the vocabulary.
+7. Writes the vocabulary as one token per line.
+
+### Required arguments
+
+- `--output_vocab`
+- `--columns`
+- and one of:
+  - `--input_file`
+  - `--input_table`
+
+### Optional arguments
+
+- `--vocab_size` (default: `50000`)
+- `--min_frequency` (default: `1`)
+- `--lowercase` (default: `true`)
+- `--tokenizer` (`whitespace` or `regex`, default: `whitespace`)
+- `--oov_token` (default: `<UNK>`)
+- `--input_expand_factor` (default: `1`, useful for perf/load testing)
+
+### Local batch example
+
+```sh
+python -m apache_beam.examples.ml_transform.mltransform_generate_vocab \
+  --input_file=/tmp/input.jsonl \
+  --output_vocab=/tmp/vocab.txt \
+  --columns=text,category \
+  --vocab_size=5 \
+  --min_frequency=1 \
+  --lowercase=true \
+  --tokenizer=whitespace \
+  --oov_token='<UNK>' \
+  --input_expand_factor=1 \
+  --runner=DirectRunner
+```
+
+### Input format
+
+JSONL input with object rows, for example:
+
+```json
+{"id":"1","text":"Beam beam ML pipeline"}
+{"id":"2","text":"Beam pipeline dataflow"}
+{"id":"3","text":"ML transform beam"}
+{"id":"4","text":"vocab vocab vocab test"}
+{"id":"5","text":"rare_token_once"}
+{"id":"6","text":""}
+{"id":"7","text":null}
+```
+
+The integration tests in `mltransform_generate_vocab_test.py` generate this
+sample data programmatically.
+
+### Output format
+
+One token per line:
+
+1. `oov_token` first
+2. remaining tokens follow the vocabulary order produced by
+   `ComputeAndApplyVocabulary`.
+
+Example output (illustrates the format only):
+
+```txt
+<UNK>
+beam
+ml
+```
+
+For the sample input shown under "Input format" and this config:
+
+```sh
+--columns=text --min_frequency=2 --vocab_size=3
+```
+
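+the expected output is shown below. With the default lowercasing and
+whitespace tokenizer, the counts are `beam` = 4, `vocab` = 3, and `ml` =
+`pipeline` = 2, so `ml` and `pipeline` tie for the last slot; which of the
+two is kept depends on how `ComputeAndApplyVocabulary` breaks ties: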
+
+```txt
+<UNK>
+beam
+vocab
+ml
+```
+
+### Empty vocabulary behavior
+
+If all tokens are filtered out by `--min_frequency`, the pipeline writes only
+the reserved `--oov_token` and logs a warning.
+
+### Additional test datasets
+
+Test data for happy path and null/empty/missing columns is generated inline in
+`mltransform_generate_vocab_test.py`.
+
+### Performance testing pattern
+
+- Small local files: functional correctness and output-stability tests.
+- Large GCS files (or moderate file + `--input_expand_factor`): throughput/cost
+  benchmarking on Dataflow.
+
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,8 @@`
`44`	`44`	`("85", ["268", "269", "270", "271", "272"]), # PyTorch Sentiment Batch DistilBERT base uncased`
`45`	`45`	`("86", ["284", "285", "286", "287", "288"]), # VLLM Batch Gemma`
`46`	`46`	`("96", ["270", "304", "305", "353", "354"]), # Table Row Inference Sklearn Batch`
`47`		`- ("106", ["355", "356", "357", "358", "359"]) # Table Row Inference Sklearn Streaming`
	`47`	`+ ("106", ["355", "356", "357", "358", "359"]), # Table Row Inference Sklearn Streaming`
	`48`	`+ ("107", ["360", "361", "362", "363", "364"]), # MLTransform Generate Vocab Batch`
`48`	`49`	`]`
`49`	`50`
`50`	`51`	`def get_look(id: str) -> models.Look:`