| 35 | | |
|---|
| | 43 | /** |
|---|
| | 44 | * Default settings |
|---|
| | 45 | */ |
|---|
| | 46 | #define DEF_RECALC_ON_VOICED 4000 /* Time to recalculate threshold |
|---|
| | 47 | in voiced condition, in ms */ |
|---|
| | 48 | #define DEF_RECALC_ON_SILENCE 2000 /* Time to recalculate threshold |
|---|
| | 49 | in silence condition, in ms. */ |
|---|
| | 50 | #define DEF_BEFORE_SILENCE 400 /* Silence time before really changing |
|---|
| | 51 | state into SILENCE, in ms. */ |
|---|
| | 52 | #define DEF_THRESHOLD 1000 /* Default threshold. */ |
|---|
| | 53 | |
|---|
| | 54 | /** |
|---|
| | 55 | * This enumeration specifies the states of the silence detector. |
|---|
| | 56 | */ |
|---|
| | 57 | enum pjmedia_silence_det_state { |
|---|
| | 58 | STATE_SILENCE, |
|---|
| | 59 | STATE_START_SILENCE, |
|---|
| | 60 | STATE_VOICED |
|---|
| | 61 | }; |
|---|
| 44 | | int mode; /**< VAD mode. */ |
|---|
| 45 | | unsigned ptime; /**< Frame time, in msec. */ |
|---|
| 46 | | |
|---|
| 47 | | unsigned min_signal_cnt; /**< # of signal frames before talk burst */ |
|---|
| 48 | | unsigned min_silence_cnt; /**< # of silence frames before silence. */ |
|---|
| 49 | | unsigned recalc_cnt; /**< # of frames before adaptive recalc. */ |
|---|
| 50 | | |
|---|
| 51 | | pj_bool_t in_talk; /**< In talk burst? */ |
|---|
| 52 | | unsigned cur_cnt; /**< # of frames in current mode. */ |
|---|
| 53 | | unsigned signal_cnt; /**< # of signal frames received. */ |
|---|
| 54 | | unsigned silence_cnt; /**< # of silence frames received */ |
|---|
| 55 | | unsigned cur_threshold; /**< Current silence threshold. */ |
|---|
| 56 | | unsigned weakest_signal; /**< Weakest signal detected. */ |
|---|
| 57 | | unsigned loudest_silence; /**< Loudest silence detected. */ |
|---|
| | 70 | int mode; /**< VAD mode. */ |
|---|
| | 71 | unsigned ptime; /**< Frame time, in msec. */ |
|---|
| | 72 | |
|---|
| | 73 | unsigned threshold; /**< Current threshold level. */ |
|---|
| | 74 | unsigned sum_level; /**< Total sum of recent level. */ |
|---|
| | 75 | unsigned sum_cnt; /**< Number of levels summed. */ |
|---|
| | 76 | unsigned silence_timer; /**< Silence condition timer. */ |
|---|
| | 77 | unsigned voiced_timer; /**< Voiced condition timer. */ |
|---|
| | 78 | |
|---|
| | 79 | enum pjmedia_silence_det_state state;/**< Silence detector state. */ |
|---|
| | 80 | unsigned recalc_on_voiced; /**< Setting of time to recalc |
|---|
| | 81 | threshold in voiced condition. */ |
|---|
| | 82 | unsigned recalc_on_silence; /**< Setting of time to recalc |
|---|
| | 83 | threshold in silence condition.*/ |
|---|
| | 84 | unsigned before_silence; /**< Setting of silence time before |
|---|
| | 85 | really changing state into SILENCE, |
|---|
| | 86 | in ms. */ |
|---|
| 140 | | if (min_silence == -1) |
|---|
| 141 | | min_silence = 500; |
|---|
| 142 | | if (min_signal < 0) |
|---|
| 143 | | min_signal = sd->ptime; |
|---|
| 144 | | if (recalc_time < 0) |
|---|
| 145 | | recalc_time = 2000; |
|---|
| 146 | | |
|---|
| 147 | | sd->min_signal_cnt = min_signal / sd->ptime; |
|---|
| 148 | | sd->min_silence_cnt = min_silence / sd->ptime; |
|---|
| 149 | | sd->recalc_cnt = recalc_time / sd->ptime; |
|---|
| | 163 | if (recalc_time1 < 0) |
|---|
| | 164 | recalc_time1 = DEF_RECALC_ON_VOICED; |
|---|
| | 165 | if (recalc_time2 < 0) |
|---|
| | 166 | recalc_time2 = DEF_RECALC_ON_SILENCE; |
|---|
| | 167 | if (before_silence < 0) |
|---|
| | 168 | before_silence = DEF_BEFORE_SILENCE; |
|---|
| | 169 | |
|---|
| | 170 | sd->recalc_on_voiced = recalc_time1; |
|---|
| | 171 | sd->recalc_on_silence = recalc_time2; |
|---|
| | 172 | sd->before_silence = before_silence; |
|---|
| 195 | | /* Convert PCM level to ulaw */ |
|---|
| 196 | | level = pjmedia_linear2ulaw(level) ^ 0xff; |
|---|
| 197 | | |
|---|
| 198 | | /* Do we have signal? */ |
|---|
| 199 | | have_signal = level > sd->cur_threshold; |
|---|
| 200 | | |
|---|
| 201 | | /* If we're in transition between silence and signal, increment the |
|---|
| 202 | | * current frame counter. We will only switch mode when we have enough |
|---|
| 203 | | * frames. |
|---|
| 204 | | */ |
|---|
| 205 | | if (sd->in_talk != have_signal) { |
|---|
| 206 | | unsigned limit; |
|---|
| 207 | | |
|---|
| 208 | | sd->cur_cnt++; |
|---|
| 209 | | |
|---|
| 210 | | limit = (sd->in_talk ? sd->min_silence_cnt : |
|---|
| 211 | | sd->min_signal_cnt); |
|---|
| 212 | | |
|---|
| 213 | | if (sd->cur_cnt > limit) { |
|---|
| 214 | | |
|---|
| 215 | | /* Swap mode */ |
|---|
| 216 | | sd->in_talk = !sd->in_talk; |
|---|
| 217 | | |
|---|
| 218 | | /* Restart adaptive cur_threshold measurements */ |
|---|
| 219 | | sd->weakest_signal = 0xFFFFFFFFUL; |
|---|
| 220 | | sd->loudest_silence = 0; |
|---|
| 221 | | sd->signal_cnt = 0; |
|---|
| 222 | | sd->silence_cnt = 0; |
|---|
| 223 | | sd->cur_cnt = 0; |
|---|
| | 217 | if (sd->mode == VAD_MODE_FIXED) |
|---|
| | 218 | return (level < sd->threshold); |
|---|
| | 219 | |
|---|
| | 220 | /* Calculating recent level */ |
|---|
| | 221 | sd->sum_level += level; |
|---|
| | 222 | ++sd->sum_cnt; |
|---|
| | 223 | avg_recent_level = (sd->sum_level / sd->sum_cnt); |
|---|
| | 224 | |
|---|
| | 225 | if (level > sd->threshold) { |
|---|
| | 226 | sd->silence_timer = 0; |
|---|
| | 227 | sd->voiced_timer += sd->ptime; |
|---|
| | 228 | |
|---|
| | 229 | switch(sd->state) { |
|---|
| | 230 | case STATE_VOICED: |
|---|
| | 231 | if (sd->voiced_timer > sd->recalc_on_voiced) { |
|---|
| | 232 | /* Voiced for long time (>recalc_on_voiced), current |
|---|
| | 233 | * threshold seems to be too low. |
|---|
| | 234 | */ |
|---|
| | 235 | sd->threshold = (avg_recent_level + sd->threshold) >> 1; |
|---|
| | 236 | TRACE_((THIS_FILE,"Re-adjust threshold (in talk burst)" |
|---|
| | 237 | "to %d", sd->threshold)); |
|---|
| | 238 | |
|---|
| | 239 | sd->voiced_timer = 0; |
|---|
| | 240 | |
|---|
| | 241 | /* Reset sum_level */ |
|---|
| | 242 | sd->sum_level = avg_recent_level; |
|---|
| | 243 | sd->sum_cnt = 1; |
|---|
| | 244 | } |
|---|
| | 245 | break; |
|---|
| | 246 | |
|---|
| | 247 | case STATE_SILENCE: |
|---|
| | 248 | TRACE_((THIS_FILE,"Starting talk burst (level=%d threshold=%d)", |
|---|
| | 249 | level, sd->threshold)); |
|---|
| | 250 | |
|---|
| | 251 | case STATE_START_SILENCE: |
|---|
| | 252 | sd->state = STATE_VOICED; |
|---|
| | 253 | |
|---|
| | 254 | /* Reset sum_level */ |
|---|
| | 255 | sd->sum_level = level; |
|---|
| | 256 | sd->sum_cnt = 1; |
|---|
| | 257 | |
|---|
| | 258 | break; |
|---|
| | 259 | |
|---|
| | 260 | default: |
|---|
| | 261 | pj_assert(0); |
|---|
| | 262 | break; |
|---|
| 227 | | /* Reset frame count */ |
|---|
| 228 | | sd->cur_cnt = 0; |
|---|
| | 265 | sd->voiced_timer = 0; |
|---|
| | 266 | sd->silence_timer += sd->ptime; |
|---|
| | 267 | |
|---|
| | 268 | switch(sd->state) { |
|---|
| | 269 | case STATE_SILENCE: |
|---|
| | 270 | if (sd->silence_timer >= sd->recalc_on_silence) { |
|---|
| | 271 | sd->threshold = avg_recent_level << 1; |
|---|
| | 272 | TRACE_((THIS_FILE,"Re-adjust threshold (in silence)" |
|---|
| | 273 | "to %d", sd->threshold)); |
|---|
| | 274 | |
|---|
| | 275 | sd->silence_timer = 0; |
|---|
| | 276 | |
|---|
| | 277 | /* Reset sum_level */ |
|---|
| | 278 | sd->sum_level = avg_recent_level; |
|---|
| | 279 | sd->sum_cnt = 1; |
|---|
| | 280 | } |
|---|
| | 281 | break; |
|---|
| | 282 | |
|---|
| | 283 | case STATE_VOICED: |
|---|
| | 284 | sd->state = STATE_START_SILENCE; |
|---|
| | 285 | |
|---|
| | 286 | /* Reset sum_level */ |
|---|
| | 287 | sd->sum_level = level; |
|---|
| | 288 | sd->sum_cnt = 1; |
|---|
| | 289 | |
|---|
| | 290 | case STATE_START_SILENCE: |
|---|
| | 291 | if (sd->silence_timer >= sd->before_silence) { |
|---|
| | 292 | sd->state = STATE_SILENCE; |
|---|
| | 293 | sd->threshold = avg_recent_level << 1; |
|---|
| | 294 | TRACE_((THIS_FILE,"Starting silence (level=%d " |
|---|
| | 295 | "threshold=%d)", level, sd->threshold)); |
|---|
| | 296 | |
|---|
| | 297 | /* Reset sum_level */ |
|---|
| | 298 | sd->sum_level = avg_recent_level; |
|---|
| | 299 | sd->sum_cnt = 1; |
|---|
| | 300 | } |
|---|
| | 301 | break; |
|---|
| | 302 | |
|---|
| | 303 | default: |
|---|
| | 304 | pj_assert(0); |
|---|
| | 305 | break; |
|---|
| | 306 | } |
|---|
| 230 | | |
|---|
| 231 | | |
|---|
| 232 | | /* Count the number of silent and signal frames and calculate min/max */ |
|---|
| 233 | | if (have_signal) { |
|---|
| 234 | | if (level < sd->weakest_signal) |
|---|
| 235 | | sd->weakest_signal = level; |
|---|
| 236 | | sd->signal_cnt++; |
|---|
| 237 | | } |
|---|
| 238 | | else { |
|---|
| 239 | | if (level > sd->loudest_silence) |
|---|
| 240 | | sd->loudest_silence = level; |
|---|
| 241 | | sd->silence_cnt++; |
|---|
| 242 | | } |
|---|
| 243 | | |
|---|
| 244 | | /* See if we have had enough frames to look at proportions of |
|---|
| 245 | | * silence/signal frames. |
|---|
| 246 | | */ |
|---|
| 247 | | if ((sd->signal_cnt + sd->silence_cnt) > sd->recalc_cnt) { |
|---|
| 248 | | |
|---|
| 249 | | if (sd->mode == VAD_MODE_ADAPTIVE) { |
|---|
| 250 | | pj_bool_t updated = PJ_TRUE; |
|---|
| 251 | | unsigned pct_signal, new_threshold = sd->cur_threshold; |
|---|
| 252 | | |
|---|
| 253 | | /* Get percentage of signal */ |
|---|
| 254 | | pct_signal = sd->signal_cnt * 100 / |
|---|
| 255 | | (sd->signal_cnt + sd->silence_cnt); |
|---|
| 256 | | |
|---|
| 257 | | /* Adjust according to signal/silence proportions. */ |
|---|
| 258 | | if (pct_signal > 95) { |
|---|
| 259 | | new_threshold += (sd->weakest_signal+1 - sd->cur_threshold)/2; |
|---|
| 260 | | } else if (pct_signal < 5) { |
|---|
| 261 | | new_threshold = (sd->cur_threshold+sd->loudest_silence)/2+1; |
|---|
| 262 | | } else if (pct_signal > 80) { |
|---|
| 263 | | new_threshold++; |
|---|
| 264 | | } else if (pct_signal < 10) { |
|---|
| 265 | | new_threshold--; |
|---|
| 266 | | } else { |
|---|
| 267 | | updated = PJ_FALSE; |
|---|
| 268 | | } |
|---|
| 269 | | |
|---|
| 270 | | if (new_threshold > PJMEDIA_SILENCE_DET_MAX_THRESHOLD) |
|---|
| 271 | | new_threshold = PJMEDIA_SILENCE_DET_MAX_THRESHOLD; |
|---|
| 272 | | |
|---|
| 273 | | if (updated && sd->cur_threshold != new_threshold) { |
|---|
| 274 | | PJ_LOG(5,(sd->objname, |
|---|
| 275 | | "Vad cur_threshold updated %d-->%d. " |
|---|
| 276 | | "Signal lo=%d", |
|---|
| 277 | | sd->cur_threshold, new_threshold, |
|---|
| 278 | | sd->weakest_signal)); |
|---|
| 279 | | sd->cur_threshold = new_threshold; |
|---|
| 280 | | } |
|---|
| 281 | | } |
|---|
| 282 | | |
|---|
| 283 | | /* Reset. */ |
|---|
| 284 | | sd->weakest_signal = 0xFFFFFFFFUL; |
|---|
| 285 | | sd->loudest_silence = 0; |
|---|
| 286 | | sd->signal_cnt = 0; |
|---|
| 287 | | sd->silence_cnt = 0; |
|---|
| 288 | | } |
|---|
| 289 | | |
|---|
| 290 | | return !sd->in_talk; |
|---|
| 291 | | |
|---|
| | 308 | |
|---|
| | 309 | return (sd->state == STATE_SILENCE); |
|---|