forked from NVIDIA-NeMo/Curator
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpipeline.yaml
More file actions
79 lines (61 loc) · 2.23 KB
/
pipeline.yaml
File metadata and controls
79 lines (61 loc) · 2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Hydra defaults list: `_self_` places this file's own values in the
# composition order; the two overrides switch Hydra's job/hydra logging
# configuration to `none`.
defaults:
  - _self_
  - override hydra/job_logging: none
  - override hydra/hydra_logging: none

# Run in the current working directory and do not create Hydra's
# per-run output sub-directory.
hydra:
  run:
    dir: .
  output_subdir: null
# Human-readable description of this pipeline (reStructuredText).
documentation: |
  FLEURS
  ######

  This config can be used to prepare
  `FLEURS <https://huggingface.co/datasets/google/fleurs>`_
  dataset in the NeMo format.

  It produces a manifest for the dev split of the Armenian language.
  This config performs the following data processing.

  1. Downloads FLEURS data and creates the initial manifest
  2. Transcribes the audio with the ``nvidia/stt_hy_fastconformer_hybrid_large_pc`` ASR model
  3. Calculates the WER between the reference text and the ASR prediction
  4. Calculates the length of wav files
  5. Keeps only the entries whose WER is less than or equal to 75
  6. Writes the resulting manifest in JSONL format

  **Required arguments**.

  * **raw_data_dir**: specify the workspace folder where all audio files will be stored.

  Note that you can customize any part of this config either directly or from command-line.

  **Output format**

  This config generates output manifest files:

  * ``${raw_data_dir}/result`` - dev subset of the data.

  Output manifest contains the following keys:

  * **audio_filepath (str)**: relative path to the audio files.
  * **text (str)**: transcription (lower-case without punctuation).
  * **pred_text (str)**: transcription predicted by the ASR model.
  * **wer (float)**: word error rate between ``text`` and ``pred_text``.
  * **duration (float)**: audio duration in seconds.
# Selects which entries of `processors` are executed.
# NOTE(review): `all` presumably means every stage — confirm against the runner.
processors_to_run: all
# Required argument (`???` marks it as mandatory, it must be supplied by the
# user): workspace folder where all audio files will be stored.
raw_data_dir: ???
# Dataset split; passed to the first processor as `split`.
data_split: dev
# Output location; passed to the JSONL writer as `path`.
output_dir: ${raw_data_dir}/result
# Name of the execution backend.
# NOTE(review): its semantics are defined by the launcher, not visible here.
backend: xenna
# Ordered list of pipeline stages; each entry names the stage class in
# `_target_`, and the remaining keys are that stage's arguments.
processors:
  # Getting data for Armenian dev set
  - _target_: nemo_curator.stages.audio.datasets.fleurs.create_initial_manifest.CreateInitialManifestFleursStage
    lang: "hy_am"
    split: ${data_split}
    raw_data_dir: ${raw_data_dir}
    batch_size: 4

  # ASR inference with the named NeMo model; the stage requests one GPU.
  - _target_: nemo_curator.stages.audio.inference.asr_nemo.InferenceAsrNemoStage
    model_name: nvidia/stt_hy_fastconformer_hybrid_large_pc
    resources:
      _target_: nemo_curator.stages.resources.Resources
      gpus: 1.0

  # WER between the reference `text` and the prediction `pred_text`,
  # stored under `wer`.
  - _target_: nemo_curator.stages.audio.metrics.get_wer.GetPairwiseWerStage
    text_key: "text"
    pred_text_key: "pred_text"
    wer_key: "wer"

  # Audio duration of the file at `audio_filepath`, stored under `duration`.
  - _target_: nemo_curator.stages.audio.common.GetAudioDurationStage
    audio_filepath_key: audio_filepath
    duration_key: duration

  # Filter on `wer` against 75.0.
  # NOTE(review): "le" is presumably "less than or equal" and WER is
  # presumably expressed in percent — confirm against PreserveByValueStage.
  - _target_: nemo_curator.stages.audio.common.PreserveByValueStage
    input_value_key: "wer"
    target_value: 75.0
    operator: "le"

  # Convert the audio entries to documents so the text writer can consume them.
  - _target_: nemo_curator.stages.audio.io.convert.AudioToDocumentStage

  # Write the result as JSONL to `output_dir`; `force_ascii: false` is
  # forwarded through `write_kwargs` (presumably to keep non-ASCII Armenian
  # text unescaped — confirm against JsonlWriter).
  - _target_: nemo_curator.stages.text.io.writer.JsonlWriter
    path: ${output_dir}
    write_kwargs:
      force_ascii: false