|
6 | 6 | from tqdm import tqdm |
7 | 7 | import sys |
8 | 8 | import os |
9 | | -from modelscope.pipelines import pipeline |
10 | | -from modelscope.utils.constant import Tasks |
11 | 9 |
|
12 | 10 |
|
13 | | -from modelscope.hub.snapshot_download import snapshot_download |
14 | | - |
15 | 11 |
|
16 | 12 | from common.constants import Languages |
17 | 13 | from common.log import logger |
18 | 14 | from common.stdout_wrapper import SAFE_STDOUT |
19 | 15 |
|
20 | 16 | import re |
21 | 17 |
|
22 | | -# Specify the local directory for downloaded models
23 | | -local_dir_root = "./models_from_modelscope" |
24 | | -model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', cache_dir=local_dir_root) |
25 | | -model_dir_punc_ct = snapshot_download('damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch', cache_dir=local_dir_root) |
26 | | -model_dir_vad = snapshot_download('damo/speech_fsmn_vad_zh-cn-16k-common-pytorch', cache_dir=local_dir_root) |
27 | | - |
28 | | -model_dir_ja = snapshot_download('damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline', cache_dir=local_dir_root) |
29 | | - |
30 | | - |
31 | | -model_dir_en = snapshot_download('damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline', cache_dir=local_dir_root) |
32 | 18 |
|
33 | 19 | device = "cuda:0" if torch.cuda.is_available() else "cpu" |
34 | 20 |
|
35 | 21 |
|
36 | | - |
37 | | -inference_pipeline = pipeline( |
38 | | - task=Tasks.auto_speech_recognition, |
39 | | - model=model_dir, |
40 | | - vad_model=model_dir_vad, |
41 | | - punc_model=model_dir_punc_ct, |
42 | | - #lm_model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch', |
43 | | - #lm_weight=0.15, |
44 | | - #beam_size=10, |
45 | | -) |
46 | | -param_dict = {} |
47 | | -param_dict['use_timestamp'] = False |
48 | | -# folderpath = sys.argv[1] |
49 | | -extensions = ['wav'] |
50 | | - |
51 | | - |
52 | | - |
53 | | -inference_pipeline_ja = pipeline( |
54 | | - task=Tasks.auto_speech_recognition, |
55 | | - model=model_dir_ja, |
56 | | - # vad_model=model_dir_vad, |
57 | | - # punc_model=model_dir_punc_ct, |
58 | | - #lm_model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch', |
59 | | - #lm_weight=0.15, |
60 | | - #beam_size=10, |
61 | | -) |
62 | | - |
63 | | - |
64 | | -inference_pipeline_en = pipeline( |
65 | | - task=Tasks.auto_speech_recognition, |
66 | | - model=model_dir_en, |
67 | | - # vad_model=model_dir_vad, |
68 | | - # punc_model=model_dir_punc_ct, |
69 | | - #lm_model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch', |
70 | | - #lm_weight=0.15, |
71 | | - #beam_size=10, |
72 | | -) |
73 | | - |
74 | | - |
75 | | -model = whisper.load_model("medium",download_root="./whisper_model/") |
76 | | - |
77 | | - |
| 22 | +from funasr import AutoModel |
| 23 | + |
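| | +# SenseVoiceSmall (FunASR): multilingual ASR for zh/en/yue/ja/ko that also emits emotion and audio-event tags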
| 24 | +model_dir = "iic/SenseVoiceSmall" |
| 25 | + |
| 26 | + |
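| | +# Emotion tags → emoji used when rendering transcripts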
| 27 | +emo_dict = { |
| 28 | + "<|HAPPY|>": "😊", |
| 29 | + "<|SAD|>": "😔", |
| 30 | + "<|ANGRY|>": "😡", |
| 31 | + "<|NEUTRAL|>": "", |
| 32 | + "<|FEARFUL|>": "😰", |
| 33 | + "<|DISGUSTED|>": "🤢", |
| 34 | + "<|SURPRISED|>": "😮", |
| 35 | +} |
| 36 | + |
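| | +# Audio-event tags → emoji; tags mapped to "" are removed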
| 37 | +event_dict = { |
| 38 | + "<|BGM|>": "🎼", |
| 39 | + "<|Speech|>": "", |
| 40 | + "<|Applause|>": "👏", |
| 41 | + "<|Laughter|>": "😀", |
| 42 | + "<|Cry|>": "😭", |
| 43 | + "<|Sneeze|>": "🤧", |
| 44 | + "<|Breath|>": "", |
| 45 | + "<|Cough|>": "🤧", |
| 46 | +} |
| 47 | + |
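| | +# Full tag-to-emoji table; language and markup tags map to "" so they are stripped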
| 48 | +emoji_dict = { |
| 49 | + "<|nospeech|><|Event_UNK|>": "❓", |
| 50 | + "<|zh|>": "", |
| 51 | + "<|en|>": "", |
| 52 | + "<|yue|>": "", |
| 53 | + "<|ja|>": "", |
| 54 | + "<|ko|>": "", |
| 55 | + "<|nospeech|>": "", |
| 56 | + "<|HAPPY|>": "😊", |
| 57 | + "<|SAD|>": "😔", |
| 58 | + "<|ANGRY|>": "😡", |
| 59 | + "<|NEUTRAL|>": "", |
| 60 | + "<|BGM|>": "🎼", |
| 61 | + "<|Speech|>": "", |
| 62 | + "<|Applause|>": "👏", |
| 63 | + "<|Laughter|>": "😀", |
| 64 | + "<|FEARFUL|>": "😰", |
| 65 | + "<|DISGUSTED|>": "🤢", |
| 66 | + "<|SURPRISED|>": "😮", |
| 67 | + "<|Cry|>": "😭", |
| 68 | + "<|EMO_UNKNOWN|>": "", |
| 69 | + "<|Sneeze|>": "🤧", |
| 70 | + "<|Breath|>": "", |
| 71 | + "<|Cough|>": "😷", |
| 72 | + "<|Sing|>": "", |
| 73 | + "<|Speech_Noise|>": "", |
| 74 | + "<|withitn|>": "", |
| 75 | + "<|woitn|>": "", |
| 76 | + "<|GBG|>": "", |
| 77 | + "<|Event_UNK|>": "", |
| 78 | +} |
| 79 | + |
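| | +# Collapse all language tags to one <|lang|> marker so format_str_v3 can split the text into per-language segments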
| 80 | +lang_dict = { |
| 81 | + "<|zh|>": "<|lang|>", |
| 82 | + "<|en|>": "<|lang|>", |
| 83 | + "<|yue|>": "<|lang|>", |
| 84 | + "<|ja|>": "<|lang|>", |
| 85 | + "<|ko|>": "<|lang|>", |
| 86 | + "<|nospeech|>": "<|lang|>", |
| 87 | +} |
| 88 | + |
| 89 | +emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"} |
| 90 | +event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
78 | 91 |
|
79 | 92 | lang2token = { |
80 | 93 | 'zh': "ZH|", |
81 | 94 | 'ja': "JP|", |
82 | 95 | "en": "EN|", |
| 96 | + "ko": "KO|", |
| 97 | + "yue": "YUE|", |
83 | 98 | } |
84 | 99 |
|
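| | +# Replace each known tag with its emoji mapping (or drop it) in a single pass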
| 100 | +def format_str(s): |
| 101 | + for sptk in emoji_dict: |
| 102 | + s = s.replace(sptk, emoji_dict[sptk]) |
| 103 | + return s |
| 104 | + |
| 105 | + |
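| | +# Strip every tag, prepend emoji for any detected events, and append the emoji of the dominant emotion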
| 106 | +def format_str_v2(s): |
| 107 | + sptk_dict = {} |
| 108 | + for sptk in emoji_dict: |
| 109 | + sptk_dict[sptk] = s.count(sptk) |
| 110 | + s = s.replace(sptk, "") |
| 111 | + emo = "<|NEUTRAL|>" |
| 112 | + for e in emo_dict: |
| 113 | + if sptk_dict[e] > sptk_dict[emo]: |
| 114 | + emo = e |
| 115 | + for e in event_dict: |
| 116 | + if sptk_dict[e] > 0: |
| 117 | + s = event_dict[e] + s |
| 118 | + s = s + emo_dict[emo] |
| 119 | + |
| 120 | + for emoji in emo_set.union(event_set): |
| 121 | + s = s.replace(" " + emoji, emoji) |
| 122 | + s = s.replace(emoji + " ", emoji) |
| 123 | + return s.strip() |
| 124 | + |
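| | +# Split on language tags, format each segment with format_str_v2, then merge, de-duplicating adjacent event/emotion emoji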
| 125 | +def format_str_v3(s): |
| 126 | + def get_emo(s): |
| 127 | + return s[-1] if s[-1] in emo_set else None |
| 128 | + def get_event(s): |
| 129 | + return s[0] if s[0] in event_set else None |
| 130 | + |
| 131 | + s = s.replace("<|nospeech|><|Event_UNK|>", "❓") |
| 132 | + for lang in lang_dict: |
| 133 | + s = s.replace(lang, "<|lang|>") |
| 134 | + s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")] |
| 135 | + new_s = " " + s_list[0] |
| 136 | + cur_ent_event = get_event(new_s) |
| 137 | + for i in range(1, len(s_list)): |
| 138 | + if len(s_list[i]) == 0: |
| 139 | + continue |
| 140 | + if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) is not None:
| 141 | + s_list[i] = s_list[i][1:] |
| 142 | + #else: |
| 143 | + cur_ent_event = get_event(s_list[i]) |
| 144 | + if get_emo(s_list[i]) is not None and get_emo(s_list[i]) == get_emo(new_s):
| 145 | + new_s = new_s[:-1] |
| 146 | + new_s += s_list[i].strip()
| 147 | + new_s = new_s.replace("The.", " ") # apparent upstream workaround for a stray "The." artifact
| 148 | + return new_s.strip() |
85 | 149 |
|
86 | 150 | def transcribe_one(audio_path,language): |
87 | 151 |
|
88 | | - audio = whisper.load_audio(audio_path) |
89 | | - audio = whisper.pad_or_trim(audio) |
90 | | - mel = whisper.log_mel_spectrogram(audio).to(model.device) |
91 | | - _, probs = model.detect_language(mel) |
92 | | - language = max(probs, key=probs.get) |
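| | + # NOTE: the model is re-created on every call; hoisting it to module scope would avoid reloading weights per file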
| 152 | + model = AutoModel(model=model_dir, |
| 153 | + vad_model="fsmn-vad", |
| 154 | + vad_kwargs={"max_single_segment_time": 30000}, |
| 155 | + trust_remote_code=True, device=device) # fall back to CPU when CUDA is unavailable
| 156 | + |
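| | + # fsmn-vad splits long audio into <=30 s segments; use_itn=False skips punctuation and inverse text normalization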
| 157 | + res = model.generate( |
| 158 | + input=audio_path, |
| 159 | + cache={}, |
| 160 | + language=language, # "auto", "zh", "en", "yue", "ja", "ko", "nospeech"
| 161 | + use_itn=False, |
| 162 | + batch_size_s=0, |
| 163 | + ) |
| 164 | + |
| 165 | + try: |
93 | 166 |
|
94 | | - if language == "zh": |
95 | | - |
96 | | - rec_result = inference_pipeline(audio_in=audio_path, param_dict=param_dict) |
97 | | - elif language == "ja": |
98 | | - rec_result = inference_pipeline_ja(audio_in=audio_path, param_dict=param_dict) |
99 | | - else: |
100 | | - rec_result = inference_pipeline_en(audio_in=audio_path, param_dict=param_dict) |
| 167 | + text = res[0]["text"] |
| 168 | + text = format_str_v3(text) |
| 169 | + print(text) |
| 170 | + except Exception as e: |
| 171 | + print(e) |
| 172 | + text = "" |
101 | 173 |
|
102 | | - print(rec_result["text"]) |
103 | 174 |
|
104 | | - return rec_result["text"],language |
| 175 | + return text, language
105 | 176 |
|
106 | 177 |
|
107 | 178 | if __name__ == "__main__": |
108 | 179 |
|
109 | 180 | parser = argparse.ArgumentParser() |
110 | 181 |
|
111 | 182 | parser.add_argument( |
112 | | - "--language", type=str, default="ja", choices=["ja", "en", "zh"] |
| 183 | + "--language", type=str, default="ja", choices=["ja", "en", "zh", "yue", "ko"]
113 | 184 | ) |
114 | 185 | parser.add_argument("--model_name", type=str, required=True) |
115 | 186 |
|
@@ -159,6 +230,10 @@ def transcribe_one(audio_path,language): |
159 | 230 | language_id = "EN" |
160 | 231 | elif lang == "zh": |
161 | 232 | language_id = "ZH" |
| 233 | + elif lang == "yue": |
| 234 | + language_id = "YUE" |
| 235 | + elif lang == "ko": |
| 236 | + language_id = "KO" |
162 | 237 |
|
163 | 238 | f.write(file_pos+f"{file_name}|{extracted_name.replace('.wav','')}|{language_id}|{text}\n") |
164 | 239 |
|