# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Triton ensemble "sensevoice": chains three models so that a client sends
# audio plus two integer selectors and receives a transcript string.
#
#   WAV, WAV_LENS ----------> feature_extractor --> SPEECH, SPEECH_LENGTHS
#   SPEECH, SPEECH_LENGTHS,
#   LANGUAGE, TEXT_NORM ----> encoder ------------> ctc_logits, encoder_out_lens
#   ctc_logits,
#   encoder_out_lens -------> scoring ------------> TRANSCRIPTS
#
# NOTE: protobuf text format treats '#' as a comment that runs to the end of
# the line, so the license header above must stay on its own lines; the
# configuration fields below must never share a line with it.

name: "sensevoice"
platform: "ensemble"

# Batching is enabled (max_batch_size > 0), so every `dims` below describes a
# single request item and does not include the batch dimension.
max_batch_size: 16

# Tensors supplied by the client.
input [
  {
    # FP32 tensor with one variable-length dimension; routed to
    # feature_extractor's "wav" input.
    name: "WAV"
    data_type: TYPE_FP32
    dims: [-1]
  },
  {
    # One INT32 per item; routed to feature_extractor's "wav_lens" input.
    # Presumably the valid length of WAV -- confirm against feature_extractor.
    name: "WAV_LENS"
    data_type: TYPE_INT32
    dims: [1]
  },
  {
    # One INT32 per item; routed to encoder's "language" input. The meaning of
    # the integer values is defined by the encoder model, not by this file.
    name: "LANGUAGE"
    data_type: TYPE_INT32
    dims: [1]
  },
  {
    # One INT32 per item; routed to encoder's "textnorm" input. The meaning of
    # the integer values is defined by the encoder model, not by this file.
    name: "TEXT_NORM"
    data_type: TYPE_INT32
    dims: [1]
  }
]

# Tensor returned to the client; produced by the "scoring" step (its OUTPUT0).
output [
  {
    name: "TRANSCRIPTS"
    data_type: TYPE_STRING
    dims: [1]
  }
]

# In each step, `key` is the tensor name declared by the composing model and
# `value` is the ensemble-level tensor name (an ensemble input/output declared
# above, or an intermediate tensor shared between steps).
# model_version: -1 selects the latest available version of the model.
ensemble_scheduling {
  step [
    {
      # Step 1: WAV / WAV_LENS -> SPEECH / SPEECH_LENGTHS.
      model_name: "feature_extractor"
      model_version: -1
      input_map {
        key: "wav"
        value: "WAV"
      }
      input_map {
        key: "wav_lens"
        value: "WAV_LENS"
      }
      output_map {
        key: "speech"
        value: "SPEECH"
      }
      output_map {
        key: "speech_lengths"
        value: "SPEECH_LENGTHS"
      }
    },
    {
      # Step 2: consumes step 1's outputs together with the client-supplied
      # LANGUAGE and TEXT_NORM; emits ctc_logits and encoder_out_lens.
      model_name: "encoder"
      model_version: -1
      input_map {
        key: "speech"
        value: "SPEECH"
      }
      input_map {
        key: "speech_lengths"
        value: "SPEECH_LENGTHS"
      }
      input_map {
        key: "language"
        value: "LANGUAGE"
      }
      input_map {
        key: "textnorm"
        value: "TEXT_NORM"
      }
      output_map {
        key: "ctc_logits"
        value: "ctc_logits"
      }
      output_map {
        key: "encoder_out_lens"
        value: "encoder_out_lens"
      }
    },
    {
      # Step 3: consumes the encoder outputs; its OUTPUT0 becomes the
      # ensemble's TRANSCRIPTS output.
      model_name: "scoring"
      model_version: -1
      input_map {
        key: "ctc_logits"
        value: "ctc_logits"
      }
      input_map {
        key: "encoder_out_lens"
        value: "encoder_out_lens"
      }
      output_map {
        key: "OUTPUT0"
        value: "TRANSCRIPTS"
      }
    }
  ]
}