<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.0 20120330//EN" "JATS-journalpublishing1.dtd">
<article article-type="case-report" dtd-version="1.0" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">KJORL</journal-id>
<journal-title-group>
<journal-title>Korean Journal of Otorhinolaryngology-Head and Neck Surgery</journal-title><abbrev-journal-title>Korean J Otorhinolaryngol-Head Neck Surg</abbrev-journal-title></journal-title-group>
<issn pub-type="ppub">2092-5859</issn>
<issn pub-type="epub">2092-6529</issn>
<publisher>
<publisher-name>Korean Society of Otorhinolaryngology-Head and Neck Surgery</publisher-name></publisher></journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3342/kjorl-hns.2026.00052</article-id>
<article-id pub-id-type="publisher-id">kjorl-hns-2026-00052</article-id>
<article-categories>
<subj-group>
<subject>Case Report</subject></subj-group></article-categories>
<title-group>
<article-title>Clinical Applicability of Whisper-Based Automatic Transcription for Korean Speech Audiometry Sentence Tests in Older Adults with Hearing Loss</article-title>
<trans-title-group>
<trans-title xml:lang="ko">난청 노인의 Korean Speech Audiometry 문장 검사에 대한 Whisper 기반 자동 전사의 임상적 적용 가능성</trans-title>
</trans-title-group>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">http://orcid.org/0000-0003-1623-9676</contrib-id>
<name-alternatives>
<name name-style="western" xml:lang="en"><surname>Han</surname><given-names>Woojae</given-names></name>
<name name-style="eastern" xml:lang="ko"><surname>한</surname><given-names>우재</given-names></name>
</name-alternatives>
<xref ref-type="corresp" rid="c1-kjorl-hns-2026-00052"/>
<xref ref-type="aff" rid="af1-kjorl-hns-2026-00052"><sup>1</sup></xref>
<xref ref-type="aff" rid="af2-kjorl-hns-2026-00052"><sup>2</sup></xref>
<xref ref-type="aff" rid="af3-kjorl-hns-2026-00052"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">http://orcid.org/0009-0001-0210-2372</contrib-id>
<name-alternatives>
<name name-style="western" xml:lang="en"><surname>Ma</surname><given-names>Sunmi</given-names></name>
<name name-style="eastern" xml:lang="ko"><surname>마</surname><given-names>선미</given-names></name>
</name-alternatives>
<xref ref-type="aff" rid="af1-kjorl-hns-2026-00052"><sup>1</sup></xref>
<xref ref-type="aff" rid="af2-kjorl-hns-2026-00052"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">http://orcid.org/0009-0008-4051-1848</contrib-id>
<name-alternatives>
<name name-style="western" xml:lang="en"><surname>Park</surname><given-names>Sangmin</given-names></name>
<name name-style="eastern" xml:lang="ko"><surname>박</surname><given-names>상민</given-names></name>
</name-alternatives>
<xref ref-type="aff" rid="af1-kjorl-hns-2026-00052"><sup>1</sup></xref>
<xref ref-type="aff" rid="af2-kjorl-hns-2026-00052"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<contrib-id contrib-id-type="orcid">http://orcid.org/0000-0002-1338-4897</contrib-id>
<name-alternatives>
<name name-style="western" xml:lang="en"><surname>Yoon</surname><given-names>Tae-Jin</given-names></name>
<name name-style="eastern" xml:lang="ko"><surname>윤</surname><given-names>태진</given-names></name>
</name-alternatives>
<xref ref-type="aff" rid="af4-kjorl-hns-2026-00052"><sup>4</sup></xref>
</contrib>
<aff-alternatives id="af1-kjorl-hns-2026-00052">
<aff xml:lang="en"><label>1</label>Laboratory of Hearing and Technology, College of Natural Sciences, Hallym University, Chuncheon, <country>Korea</country></aff>
<aff xml:lang="ko"><label>1</label>한림대학교 자연과학대학 청각과학기술연구실</aff>
</aff-alternatives>
<aff-alternatives id="af2-kjorl-hns-2026-00052">
<aff xml:lang="en"><label>2</label>Division of Speech Pathology and Audiology, College of Natural Sciences, Hallym University, Chuncheon, <country>Korea</country></aff>
<aff xml:lang="ko"><label>2</label>한림대학교 자연과학대학 언어청각학부</aff>
</aff-alternatives>
<aff-alternatives id="af3-kjorl-hns-2026-00052">
<aff xml:lang="en"><label>3</label>Research Institute of Audiology and Speech Pathology, College of Natural Sciences, Hallym University, Chuncheon, <country>Korea</country></aff>
<aff xml:lang="ko"><label>3</label>한림대학교 자연과학대학 청각언어연구소</aff>
</aff-alternatives>
<aff-alternatives id="af4-kjorl-hns-2026-00052">
<aff xml:lang="en"><label>4</label>Department of English Language and Literature, Sungshin Women&#x02019;s University, Seoul, <country>Korea</country></aff>
<aff xml:lang="ko"><label>4</label>성신여자대학교 영어영문학과</aff>
</aff-alternatives>
</contrib-group>
<author-notes>
<corresp id="c1-kjorl-hns-2026-00052">Address for correspondence Woojae Han, PhD Division of Speech Pathology and Audiology, College of Natural Sciences, Hallym University, 1 Hallymdaehak-gil, Chuncheon 24252, Korea Tel +82-33-248-2216 E-mail <email>woojaehan@hallym.ac.kr</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>20</day>
<month>5</month>
<year>2026</year></pub-date>
<elocation-id>kjorl-hns.2026.00052</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>01</month>
<year>2026</year></date>
<date date-type="rev-recd">
<day>10</day>
<month>03</month>
<year>2026</year></date>
<date date-type="accepted">
<day>17</day>
<month>03</month>
<year>2026</year></date>
</history>
<permissions>
<copyright-statement>Copyright &#x000a9; 2026 Korean Society of Otorhinolaryngology-Head and Neck Surgery</copyright-statement>
<copyright-year>2026</copyright-year>
<license>
<license-p>This is an open access article distributed under the terms of the Creative Commons Attribution Non-Commercial License (<ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by-nc/4.0">http://creativecommons.org/licenses/by-nc/4.0</ext-link>), which permits unrestricted non-commercial use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p></license></permissions>
<abstract><p>Presbycusis often disrupts sentence-level communication, while clinical speech audiometry still depends on labor-intensive scoring. This exploratory two-case study examined whether the large-scale automatic speech recognition (ASR) model Whisper large-v3 can reliably transcribe sentence recognition performances in elderly listeners with presbycusis and how age-related auditory and speech characteristics shape its error patterns. Two native Korean elderly listeners with symmetric sensorineural hearing loss (S-001: moderate steeply sloping; S-002: mild sloping high-frequency loss) completed the Korean Speech Audiometry (KSA) sentence test (80 sentences). Their repetitions in quiet were recorded and independently transcribed by four experienced audiologists. Expert transcriptions were compared with Whisper outputs generated under fixed decoding parameters and standardized text normalization, using sentence match rate, word error rate (WER), and character error rate (CER). Whisper showed relatively low CER (about 8%-15%) but substantially higher word- and sentence-level errors (WER 30% for S-001 vs. 18% for S-002; sentence match 38.5% vs. 71.2%). Errors clustered in the high-frequency fricatives/affricates, final consonants, low-frequency and polysyllabic words, and longer syntactically complex sentences. Better clinical speech audiometry scores (KSA sentence/word recognition and word recognition score) were associated with higher ASR sentence match rates and lower WER/CER across the two cases. Generic ASR partially agreed with expert transcriptions, suggesting potential as a complementary tool, but elderly- and hearing loss-tailored ASR models and test designs are needed for reliable AI-based sentence recognition.
</p></abstract>
<kwd-group>
<kwd>Audiometry</kwd>
<kwd>Hearing loss</kwd>
<kwd>Presbycusis</kwd>
<kwd>Speech recognition software</kwd>
</kwd-group>
</article-meta></front>
<body>
<sec>
<title>Introduction</title>
<p>Presbycusis is characterized primarily by sensorineural hearing loss in the high-frequency range and is associated with reduced understanding of everyday conversation, social isolation, and an increased risk of cognitive decline &#x0005b;<xref ref-type="bibr" rid="b1-kjorl-hns-2026-00052">1</xref>,<xref ref-type="bibr" rid="b2-kjorl-hns-2026-00052">2</xref>&#x0005d;. Because real-world communication is more closely related to sentence-level comprehension than to the recognition of individual words, assessment of sentence recognition ability is clinically important &#x0005b;<xref ref-type="bibr" rid="b3-kjorl-hns-2026-00052">3</xref>,<xref ref-type="bibr" rid="b4-kjorl-hns-2026-00052">4</xref>&#x0005d;.</p>
<p>However, the sentence test in the Korean Speech Audiometry (KSA), which is widely used in Korea, relies entirely on manual stimulus presentation, response transcription, and scoring by the examiner, resulting in substantial time and labor demands &#x0005b;<xref ref-type="bibr" rid="b5-kjorl-hns-2026-00052">5</xref>&#x0005d;. Maintaining consistent scoring across examiners can also be challenging. For these reasons, simpler word-based tests are often preferred over sentence tests in clinical practice, or sentence test scores are used only to a limited extent.</p>
<p>In recent years, deep learning-based automatic speech recognition (ASR) has rapidly expanded into a wide range of services, including smartphone voice assistants and real-time captioning, and its potential applications in medical and rehabilitation settings have also been actively discussed &#x0005b;<xref ref-type="bibr" rid="b6-kjorl-hns-2026-00052">6</xref>-<xref ref-type="bibr" rid="b8-kjorl-hns-2026-00052">8</xref>&#x0005d;. Whisper large-v3 (OpenAI) is a large-scale multilingual ASR model first released in 2022. It was trained on extensive multilingual and multidomain data. Furthermore, it is reported to show strong transcription performance even in noisy environments, demonstrating a measurable level of performance in Korean without additional training &#x0005b;<xref ref-type="bibr" rid="b8-kjorl-hns-2026-00052">8</xref>&#x0005d;. More recently, studies have evaluated ASR performance in vulnerable populations, including individuals with hearing loss, older adults, and residents of long-term care facilities, and have explored its integration into clinical and welfare services &#x0005b;<xref ref-type="bibr" rid="b9-kjorl-hns-2026-00052">9</xref>&#x0005d;. Accordingly, the potential role of this technology should also be considered in auditory rehabilitation &#x0005b;<xref ref-type="bibr" rid="b10-kjorl-hns-2026-00052">10</xref>&#x0005d;.</p>
<p>Nevertheless, most ASR systems have been trained primarily on speech from adults with normal hearing, and performance degradation has been reported in out-of-distribution groups such as older speakers and speakers with hearing loss &#x0005b;<xref ref-type="bibr" rid="b8-kjorl-hns-2026-00052">8</xref>,<xref ref-type="bibr" rid="b9-kjorl-hns-2026-00052">9</xref>&#x0005d;. Furthermore, research and data involving Korean older adults with hearing loss remain very limited.</p>
<p>If ASR can reliably transcribe KSA sentence test utterances produced by older adults with hearing loss, it may help automate part of speech audiometry and may also serve as a tool for tracking changes in sentence recognition in home-based or remote settings &#x0005b;<xref ref-type="bibr" rid="b10-kjorl-hns-2026-00052">10</xref>&#x0005d;. At the same time, identifying which errors occur most frequently in high-frequency consonants, low-frequency or long-syllable words, and long or complex sentences may become a key task for developing ASR models tailored to older adults with hearing loss and for designing AI-based sentence tests &#x0005b;<xref ref-type="bibr" rid="b9-kjorl-hns-2026-00052">9</xref>&#x0005d;. Despite this potential, there have been no domestic reports in which speech from older adults with hearing loss was transcribed using a general-purpose ASR system based on the KSA sentence test and then quantitatively compared with clinical speech audiometric indices. In this case study, we administered the KSA sentence test to two patients with presbycusis and performed automatic transcription using Whisper large-v3. We aimed to explore the extent to which a general-purpose ASR system agrees with audiologist transcription and KSA indices, to describe the major error patterns observed in presbycusis, and to discuss the potential of Korean-language ASR transcription as an adjunctive clinical tool for future evaluation of sentence recognition in patients with presbycusis.</p>
</sec>
<sec>
<title>Case</title>
<p>The study participants were two native Korean-speaking patients with presbycusis who visited Hallym Speech-Hearing Center: S-001, an 87-year-old man, and S-002, a 78-year-old woman. Both participants showed bilateral sensorineural hearing loss on pure-tone audiometry and bilateral type A tympanograms on immittance testing, indicating normal middle-ear function.</p>
<p>Specifically, in S-001, the right ear air-conduction thresholds ranged from 50-85 dB HL across 0.25-8 kHz, with a four-frequency pure-tone average across 0.5-4 kHz of 61.25 dB HL. The left ear ranged from 45-85 dB HL, with a four-frequency average of 57.5 dB HL, corresponding to bilateral moderate-to-severe hearing loss. In S-002, the right ear air-conduction thresholds ranged from 20-60 dB HL, with a four-frequency average of 36.25 dB HL. The left ear ranged from 25-60 dB HL, with a four-frequency average of 40 dB HL, indicating bilateral moderate hearing loss.</p>
<p>On the Korean Mini-Mental State Examination &#x0005b;<xref ref-type="bibr" rid="b11-kjorl-hns-2026-00052">11</xref>&#x0005d;, S-001 scored 25 and S-002 scored 24. Overall cognitive function was considered relatively preserved. Both participants had attained at least higher education, including experience studying abroad. Regarding hearing-assistive device history, S-001 had been wearing bilateral hearing aids for approximately 2 months, whereas S-002 reported no prior hearing-aid use. All procedures were approved by the Hallym University Institutional Review Board, and written informed consent was obtained from the participants before study participation. The auditory stimuli consisted of recordings by a standardized female speaker (announcer) from the KSA sentence test, a standard adult measure designed to balance sentence length, lexical difficulty, and phoneme distribution. A total of 80 sentences were used. Testing was conducted in a quiet laboratory with background noise below 30 dBA, and the presentation level was set at each participant&#x02019;s most comfortable level.</p>
<p>Each sentence was presented only once, and the participants were instructed to repeat it immediately, &#x0201c;exactly as you hear it.&#x0201d; All utterances were recorded using a digital recorder at 44.1 kHz, 16-bit, mono.</p>
<p>ASR and transcription were performed using the Korean version of Whisper large-v3, a large-scale general-purpose ASR model released by OpenAI. Automatic language detection was enabled during model execution, although the likelihood of language identification error was considered low because all input speech was in Korean. Because speech produced by older adults with presbycusis may show lower articulatory accuracy and greater variability than that of adults with normal hearing, a conservative decoding strategy was adopted to ensure consistency of the transcription results.</p>
<p>Specifically, beam search (beam size&#x0003d;5) was combined with greedy decoding (temperature&#x0003d;0.0). The best&#x0005f;of value was set to 5 so that the sequence with the highest log-probability among the candidate transcriptions would be selected. In addition, to account for cases in which the confidence criterion was not met during the initial decoding stage, a temperature fallback strategy was used in which low temperature values (0.0, 0.2, 0.4) were applied sequentially. This approach was intended to yield relatively stable transcription results even in segments with substantial speech distortion or acoustic uncertainty &#x0005b;<xref ref-type="bibr" rid="b12-kjorl-hns-2026-00052">12</xref>&#x0005d;.</p>
<p>Four audiologists with MS- or PhD-level training independently transcribed all utterances. To evaluate transcription reliability, sentence-level agreement among the four transcribers was additionally calculated, and inter-transcriber agreement was assessed using percentage agreement. At the sentence level, the most frequent string among the five transcriptions was defined as the majority transcription, and sentence agreement rates (calculated both with and without strict whitespace matching) were based on agreement with this transcription. At the word level, substitutions, deletions, and insertions were distinguished through token alignment, and the word error rate (WER) was calculated. Whisper performance was defined as the mean of the expert-specific WER values (AI-to-human mean WER). At the character level, the character error rate (CER) was calculated using Levenshtein distance after removing whitespace &#x0005b;<xref ref-type="bibr" rid="b12-kjorl-hns-2026-00052">12</xref>&#x0005d;. The relationships among the KSA sentence and word recognition scores (WRSs), the speech reception threshold, the WRS, and the AI-derived metrics were compared descriptively at the case level.</p>
<p>At the sentence level, Whisper transcription showed a clear performance difference between the two participants (<xref rid="t1-kjorl-hns-2026-00052" ref-type="table">Table 1</xref>). For S-001, the sentence agreement rate was 38.5% when whitespace was ignored and 28.2% under strict whitespace matching, indicating that more than half of the presented sentences did not match the majority transcription. In contrast, S-002 showed corresponding values of 71.2% and 63.7%, indicating relatively stable sentence-level transcription performance for the same sentence set. Sentence-level agreement among the four audiologists exceeded 90% in both cases, demonstrating very high inter-transcriber agreement. This suggests that the expert transcriptions were highly reliable and that variability among human transcribers was minimal, whereas agreement with Whisper transcription depended strongly on speaker characteristics.</p>
<p>At the word level, a meaningful degree of error was observed in both cases, although the magnitude differed clearly. The WER for S-001 was 0.2985, indicating that approximately 30% of words contained at least one substitution, deletion, or insertion error, whereas the WER for S-002 was 0.1802, corresponding to approximately 18% (<xref rid="t2-kjorl-hns-2026-00052" ref-type="table">Table 2</xref>). In both cases, substitution errors accounted for most of the error profile, indicating that Whisper more often misrecognized words as other words than completely missed them or produced false insertions. Deletion and insertion errors were relatively less frequent, but they occurred more often in S-001 than in S-002. In the context of these two cases, this observation tentatively suggests that severe hearing loss and reduced speech clarity may also negatively affect word-level stability. In contrast, the WER of expert transcription was only 1%-2% in both cases, confirming that, for the same speech samples, Whisper had a WER approximately 5-10 times higher than that of the experts.</p>
<p>At the character level, Whisper transcription showed relatively good performance, but it still warrants consideration for clinical application. The CER was 0.1486 for S-001 and 0.0811 for S-002, which were lower than the word-level metrics but still corresponded to errors in approximately 1 out of every 10 characters. This suggests that, although Whisper preserved recognition of individual phonemes or graphemes to some extent, additional errors accumulated at higher linguistic levels, such as word boundary detection, whitespace handling, and compound-word segmentation. Given that the CER of expert transcription was essentially close to 0, these character-level errors might also be viewed as an ASR-specific structural limitation.</p>
<p>A consistent pattern was also observed with speech audiometric measures (<xref rid="t3-kjorl-hns-2026-00052" ref-type="table">Table 3</xref>). S-001 showed relatively poor speech recognition performance, with 40% sentence recognition on the KSA, 60% word recognition, and a WRS of 68%, and this pattern was accompanied by a low sentence agreement rate and high WER and CER in Whisper transcription. In contrast, S-002 had 80% sentence recognition, 88% word recognition, and a WRS of 84%, and likewise showed a high sentence agreement rate and low WER and CER in Whisper transcription. Although no statistical testing was performed because of the small sample size, both cases showed a parallel pattern in which better clinical speech recognition performance was associated with better ASR transcription performance, cautiously suggesting that conventional KSA indices may serve as indirect indicators of variation in ASR performance in older adults with hearing loss.</p>
<p>In the detailed error-pattern analysis, Whisper showed vulnerability in both speakers to high-frequency fricatives and affricates (e.g., /ㅅ, ㅆ, ㅈ, ㅊ/) and to word-final consonants. In particular, for S-001, the error rate for high-frequency consonants was several times higher than the mean error rate of expert transcribers, and the WER increased prominently for low-frequency words, long words of three or more syllables, and words containing high-frequency consonants. At the sentence level, the error rate also increased for long sentences containing many words and for sentences with greater syntactic complexity, including conditional clauses and directive or imperative forms, suggesting a tendency for Whisper transcription of speech from older adults with hearing loss to become less stable as linguistic load increased.</p>
</sec>
<sec>
<title>Discussion</title>
<p>In this study, we compared the KSA sentence test with automatic transcription by Whisper large-v3 in two speakers with presbycusis to explore the extent to which a general-purpose ASR system can support sentence recognition assessment in older adults with hearing loss. Although the CER was relatively favorable at around 10%, performance at the word and sentence levels was markedly poorer, particularly in S-001, who showed moderate-to-severe, steeply sloping hearing loss, with a WER of approximately 30% and a sentence agreement rate below 40%. This contrasted clearly with S-002, whose hearing was relatively better. These findings are consistent with previous reports showing that ASR recognition accuracy is lower in older adults and speakers with hearing loss than in typical adult speakers &#x0005b;<xref ref-type="bibr" rid="b9-kjorl-hns-2026-00052">9</xref>&#x0005d;, and they suggest that Whisper also has structural limitations for older speakers with hearing loss.</p>
<p>The parallel pattern between conventional speech audiometric measures, including KSA sentence and word recognition and WRS, and Whisper-derived indices is clinically meaningful. In both cases, better speech recognition performance was accompanied by a higher sentence agreement rate and lower WER and CER in ASR transcription, whereas poorer speech recognition performance was accompanied by reduced AI transcription performance. This suggests that, in its current form, a general-purpose ASR system may be more useful as an auxiliary indicator that complements conventional speech audiometry than as an independent testing tool that replaces the KSA &#x0005b;<xref ref-type="bibr" rid="b12-kjorl-hns-2026-00052">12</xref>&#x0005d;. More specifically, it may help indirectly characterize speaker-specific speech clarity, vulnerability across frequency regions, and performance changes associated with increasing linguistic load &#x0005b;<xref ref-type="bibr" rid="b12-kjorl-hns-2026-00052">12</xref>&#x0005d;. For example, even among patients with the same KSA scores, comparison of ASR error patterns may allow more detailed identification of additional vulnerable areas, such as high-frequency consonants, low-frequency or long-syllable words, and complex sentences.</p>
<p>The detailed error patterns provide direct implications for the future design of ASR systems tailored to older adults with hearing loss and of AI-based sentence tests. Whisper showed concentrated errors in high-frequency fricatives and affricates, word-final consonants, low-frequency words, long words of three or more syllables, and long, syntactically complex sentences. This indicates that acoustic and articulatory characteristics typically observed in presbycusis, such as high-frequency hearing loss, reduced vocal intensity, and weakening of word-final consonants &#x0005b;<xref ref-type="bibr" rid="b13-kjorl-hns-2026-00052">13</xref>&#x0005d;, may conflict with the assumptions of the acoustic and language models used in current general-purpose ASR systems &#x0005b;<xref ref-type="bibr" rid="b14-kjorl-hns-2026-00052">14</xref>&#x0005d;. Therefore, rather than applying the existing KSA sentence lists for older adults with hearing loss directly to AI-based evaluation, it may be necessary to reconfigure difficulty on the basis of lexical frequency, syllable structure, and syntactic complexity and to use words containing high-frequency consonants or word-final consonants as separate subindices &#x0005b;<xref ref-type="bibr" rid="b15-kjorl-hns-2026-00052">15</xref>&#x0005d;. Such structured, error-based indices may provide clinically richer information than simple accuracy rates.</p>
<p>This study has several limitations. First, the number of cases was very small, with only two participants. Accordingly, the present findings should be interpreted as exploratory observations at the hypothesis-generating stage rather than as findings generalizable to a broader population. In addition, because of the very limited sample size, we did not perform a statistical comparison between ASR transcription and expert transcription, and the interpretation of the results was based on exploratory descriptive comparisons. In particular, it is difficult to exclude the possibility that the difference in the degree of hearing loss between the two participants influenced the difference in Whisper&#x02019;s transcription performance. Second, because the study was conducted under relatively idealized conditions, including a quiet test room, a standardized female speaker, and a single ASR model, performance is likely to decline further in real outpatient or home environments, where background noise, dialect or accent, hearing-aid or cochlear implant use, and changes in speaking rate may be present. Third, we did not apply domain-specific fine-tuning or additional pre- and post-processing to improve Whisper transcription performance. Exploring data augmentation strategies that incorporate speech from older adults or individuals with disabilities, along with domain-specific training, could be a potential direction for improving ASR performance. Future studies should therefore validate the clinical validity and reliability of an ASR model specifically designed for presbycusis by constructing corpora that include speech from older adults and speakers with hearing loss, fine-tuning models to reflect KSA-based sentence and lexical structures, and developing adaptive pre- and post-processing algorithms for changes in vocal intensity and speaking rate. 
Future research will also require quantitative and statistical comparisons between ASR transcription and expert transcription in larger samples with a wider range of hearing characteristics.</p>
<p>This study provides preliminary insights by directly combining the KSA sentence test with a large-scale general-purpose ASR system. Specifically, it quantitatively presents Whisper&#x02019;s performance and error patterns for actual clinical speech produced by speakers with presbycusis. Future studies should include larger samples, diverse hearing-loss configurations, hearing-assistive device use, noisy environments, and multiple ASR systems to systematically clarify the relationship between KSA indices and AI-derived transcription metrics. In addition, several steps are necessary to develop an ASR-based sentence test tailored to presbycusis that reflects the structure of the KSA. These include constructing corpora that reflect speech from older adults and speakers with hearing loss, performing model fine-tuning, and developing pre- and post-processing algorithms that account for changes in vocal intensity, speaking rate, and intonation. If these technical foundations are established, such a system may eventually serve as a practical tool for the automation of clinical speech audiometry and for remote auditory assessment.</p>
</sec>
</body>
<back>
<sec sec-type="supplementary-material"><title>Supplementary Materials</title>
<p>Korean translation of this article is available with the Online-only Data Supplement at <ext-link xlink:href="https://doi.org/10.3342/kjorl-hns.2026.00052" ext-link-type="uri">https://doi.org/10.3342/kjorl-hns.2026.00052</ext-link>.</p>
</sec>
<fn-group>
<fn fn-type="other"><p><bold>Acknowledgments</bold></p><p>None</p></fn>
<fn fn-type="participating-researchers"><p><bold>Author Contribution</bold></p>
<p>Conceptualization: Woojae Han. Data curation: Sunmi Ma and Sangmin Park. Formal analysis: Tae-Jin Yoon. Funding acquisition: Woojae Han and Tae-Jin Yoon. Methodology: all authors. Project administration: Woojae Han. Resources: Woojae Han. Writing&#x02014;original draft: Woojae Han. Writing&#x02014;review &amp; editing: all authors.</p></fn>
</fn-group>
<ref-list>
<title>REFERENCES</title>
<ref id="b1-kjorl-hns-2026-00052">
<label>1</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lin</surname><given-names>FR</given-names></name>
<name><surname>Yaffe</surname><given-names>K</given-names></name>
<name><surname>Xia</surname><given-names>J</given-names></name>
<name><surname>Xue</surname><given-names>QL</given-names></name>
<name><surname>Harris</surname><given-names>TB</given-names></name>
<name><surname>Purchase-Helzner</surname><given-names>E</given-names></name>
<etal/>
</person-group>
<article-title>Hearing loss and cognitive decline in older adults</article-title>
<source>JAMA Intern Med</source>
<year>2013</year>
<volume>173</volume>
<issue>4</issue>
<fpage>293</fpage>
<lpage>9</lpage>
</element-citation></ref>
<ref id="b2-kjorl-hns-2026-00052">
<label>2</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Bugannim</surname><given-names>Y</given-names></name>
<name><surname>Roziner</surname><given-names>I</given-names></name>
<name><surname>Kishon-Rabin</surname><given-names>L</given-names></name>
</person-group>
<article-title>Speech recognition in noise across the life span with cognition and hearing sensitivity as mediators of age effects</article-title>
<source>Sci Rep</source>
<year>2025</year>
<volume>15</volume>
<issue>1</issue>
<fpage>20575</fpage>
</element-citation></ref>
<ref id="b3-kjorl-hns-2026-00052">
<label>3</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kwak</surname><given-names>C</given-names></name>
<name><surname>Han</surname><given-names>W</given-names></name>
</person-group>
<article-title>Age-related difficulty of listening effort in elderly</article-title>
<source>Int J Environ Res Public Health</source>
<year>2021</year>
<volume>18</volume>
<issue>16</issue>
<fpage>8845</fpage>
</element-citation></ref>
<ref id="b4-kjorl-hns-2026-00052">
<label>4</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Allen</surname><given-names>JB</given-names></name>
</person-group>
<article-title>Consonant recognition and the articulation index</article-title>
<source>J Acoust Soc Am</source>
<year>2005</year>
<volume>117</volume>
<issue>4 Pt 1</issue>
<fpage>2212</fpage>
<lpage>23</lpage>
</element-citation></ref>
<ref id="b5-kjorl-hns-2026-00052">
<label>5</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jang</surname><given-names>H</given-names></name>
<name><surname>Lee</surname><given-names>J</given-names></name>
<name><surname>Lim</surname><given-names>D</given-names></name>
<name><surname>Lee</surname><given-names>K</given-names></name>
<name><surname>Jeon</surname><given-names>A</given-names></name>
<name><surname>Jung</surname><given-names>E</given-names></name>
</person-group>
<article-title>[Development of Korean standard sentence lists for sentence recognition tests]</article-title>
<source>Audiology</source>
<year>2008</year>
<volume>4</volume>
<issue>2</issue>
<fpage>161</fpage>
<lpage>77</lpage>
<comment>Korean</comment>
</element-citation></ref>
<ref id="b6-kjorl-hns-2026-00052">
<label>6</label>
<element-citation publication-type="unknown">
<comment>Meyer BT, Kollmeier B, Ooster J. Autonomous measurement of speech intelligibility utilizing automatic speech recognition [online] 2015 [cited 2025 December 28]. Available from: URL: <ext-link xlink:href="https://www.isca-archive.org/interspeech&#x0005f;2015/meyer15&#x0005f;interspeech.pdf" ext-link-type="uri">https://www.isca-archive.org/interspeech&#x0005f;2015/meyer15&#x0005f;interspeech.pdf</ext-link></comment>
</element-citation></ref>
<ref id="b7-kjorl-hns-2026-00052">
<label>7</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Radford</surname><given-names>A</given-names></name>
<name><surname>Kim</surname><given-names>JW</given-names></name>
<name><surname>Xu</surname><given-names>T</given-names></name>
<name><surname>Brockman</surname><given-names>G</given-names></name>
<name><surname>McLeavey</surname><given-names>C</given-names></name>
<name><surname>Sutskever</surname><given-names>I</given-names></name>
</person-group>
<article-title>Robust speech recognition via large-scale weak supervision</article-title>
<source>arXiv [Preprint]</source>
<year>2022</year>
<comment>[cited 2025 December 28]. Available from: URL: <ext-link xlink:href="https://doi.org/10.48550/arXiv.2212.04356" ext-link-type="uri">https://doi.org/10.48550/arXiv.2212.04356</ext-link></comment>
</element-citation></ref>
<ref id="b8-kjorl-hns-2026-00052">
<label>8</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhao</surname><given-names>R</given-names></name>
<name><surname>Choi</surname><given-names>ASG</given-names></name>
<name><surname>Koenecke</surname><given-names>A</given-names></name>
<name><surname>Rameau</surname><given-names>A</given-names></name>
</person-group>
<article-title>Quantification of automatic speech recognition system performance on d/Deaf and hard of hearing speech</article-title>
<source>Laryngoscope</source>
<year>2025</year>
<volume>135</volume>
<issue>1</issue>
<fpage>191</fpage>
<lpage>7</lpage>
</element-citation></ref>
<ref id="b9-kjorl-hns-2026-00052">
<label>9</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname><given-names>L</given-names></name>
<name><surname>Asgari</surname><given-names>M</given-names></name>
</person-group>
<article-title>Refining automatic speech recognition system for older adults</article-title>
<source>Proc IEEE Int Conf Acoust Speech Signal Process</source>
<year>2021</year>
<volume>2021</volume>
<fpage>7003</fpage>
<lpage>7</lpage>
</element-citation></ref>
<ref id="b10-kjorl-hns-2026-00052">
<label>10</label>
<element-citation publication-type="unknown">
<comment>Xu M, Shao J, Wang L. Effects of aging and age-related hearing loss on talker discrimination. In: Proceedings of the 22nd Annual Conference of the International Speech Communication Association (INTERSPEECH 2021); 2021 Aug 30&#x02013;Sep 3; Brno, Czech Republic. International Speech Communication Association; 2021:1728-32. <ext-link xlink:href="https://doi.org/10.21437/Interspeech.2021-682" ext-link-type="uri">https://doi.org/10.21437/Interspeech.2021-682</ext-link></comment>
</element-citation></ref>
<ref id="b11-kjorl-hns-2026-00052">
<label>11</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Kang</surname><given-names>Y</given-names></name>
<name><surname>Na</surname><given-names>DL</given-names></name>
<name><surname>Hahn</surname><given-names>S</given-names></name>
</person-group>
<article-title>[A validity study on the Korean mini-mental state examination (K-MMSE) in dementia patients]</article-title>
<source>J Korean Neurol Assoc</source>
<year>1997</year>
<volume>15</volume>
<issue>2</issue>
<fpage>300</fpage>
<lpage>8</lpage>
<comment>Korean</comment>
</element-citation></ref>
<ref id="b12-kjorl-hns-2026-00052">
<label>12</label>
<element-citation publication-type="unknown">
<comment>Ney H, Haeb-Umbach R, Tran BH, Oerder M. Improvements in beam search for 10000-word continuous speech recognition [online] 1992 [cited 2026 January 2]. Available from: URL: <ext-link xlink:href="http://doi.org/10.1109/ICASSP.1992.225985" ext-link-type="uri">http://doi.org/10.1109/ICASSP.1992.225985</ext-link></comment>
</element-citation></ref>
<ref id="b13-kjorl-hns-2026-00052">
<label>13</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lee</surname><given-names>SJ</given-names></name>
<name><surname>Cho</surname><given-names>Y</given-names></name>
<name><surname>Song</surname><given-names>JY</given-names></name>
<name><surname>Lee</surname><given-names>D</given-names></name>
<name><surname>Kim</surname><given-names>Y</given-names></name>
<name><surname>Kim</surname><given-names>H</given-names></name>
</person-group>
<article-title>Aging effect on Korean female voice: acoustic and perceptual examinations of breathiness</article-title>
<source>Folia Phoniatr Logop</source>
<year>2015</year>
<volume>67</volume>
<issue>6</issue>
<fpage>300</fpage>
<lpage>7</lpage>
</element-citation></ref>
<ref id="b14-kjorl-hns-2026-00052">
<label>14</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hacking</surname><given-names>C</given-names></name>
<name><surname>Verbeek</surname><given-names>H</given-names></name>
<name><surname>Hamers</surname><given-names>JPH</given-names></name>
<name><surname>Aarts</surname><given-names>S</given-names></name>
</person-group>
<article-title>The development of an automatic speech recognition model using interview data from long-term care for older adults</article-title>
<source>J Am Med Inform Assoc</source>
<year>2023</year>
<volume>30</volume>
<issue>3</issue>
<fpage>411</fpage>
<lpage>7</lpage>
</element-citation></ref>
<ref id="b15-kjorl-hns-2026-00052">
<label>15</label>
<element-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Mallaband</surname><given-names>LJ</given-names></name>
</person-group>
<article-title>The agreement of phonetic transcriptions between paediatric speech and language therapists transcribing a disordered speech sample</article-title>
<source>Int J Lang Commun Disord</source>
<year>2024</year>
<volume>59</volume>
<issue>5</issue>
<fpage>1981</fpage>
<lpage>95</lpage>
</element-citation></ref></ref-list>

<sec sec-type="display-objects">
<title>Tables</title>

<table-wrap id="t1-kjorl-hns-2026-00052" position="float">
<label>Table 1.</label>
<caption><p>Sentence-level transcription agreement for each participant</p></caption>
<table rules="groups" frame="hsides">
<thead>
<tr>
<th align="center" valign="middle">Participant</th>
<th align="center" valign="middle">Total number of sentences</th>
<th align="center" valign="middle">AI sentence match rate (ignoring whitespace, %)</th>
<th align="center" valign="middle">AI sentence match rate (strict whitespace, %)</th>
<th align="center" valign="middle">Mean expert sentence match rate (ignoring whitespace, %)</th>
<th align="center" valign="middle">Mean expert sentence match rate (strict whitespace, %)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center" valign="top">S-001</td>
<td align="center" valign="top">78<sup><xref rid="tfn1-kjorl-hns-2026-00052" ref-type="table-fn">*</xref></sup></td>
<td align="center" valign="top">38.5</td>
<td align="center" valign="top">28.2</td>
<td align="center" valign="top">&gt;90<sup><xref rid="tfn2-kjorl-hns-2026-00052" ref-type="table-fn">&#x02020;</xref></sup></td>
<td align="center" valign="top">&gt;85<sup><xref rid="tfn2-kjorl-hns-2026-00052" ref-type="table-fn">&#x02020;</xref></sup></td>
</tr>
<tr>
<td align="center" valign="top">S-002</td>
<td align="center" valign="top">80</td>
<td align="center" valign="top">71.2</td>
<td align="center" valign="top">63.7</td>
<td align="center" valign="top">&gt;95<sup><xref rid="tfn2-kjorl-hns-2026-00052" ref-type="table-fn">&#x02020;</xref></sup></td>
<td align="center" valign="top">&gt;90<sup><xref rid="tfn2-kjorl-hns-2026-00052" ref-type="table-fn">&#x02020;</xref></sup></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="tfn1-kjorl-hns-2026-00052"><label>*</label><p>two utterances were absent because the participant did not respond to some sentences;</p></fn>
<fn id="tfn2-kjorl-hns-2026-00052"><label>&#x02020;</label><p>expert transcribers produced identical or whitespace-only-differing transcriptions for most sentences, and inter-transcriber sentence agreement was estimated to be very high (predominantly &#x02265;90%).</p></fn>
</table-wrap-foot>
</table-wrap>

<table-wrap id="t2-kjorl-hns-2026-00052" position="float">
<label>Table 2.</label>
<caption><p>Word error rate (WER) and error composition by participant</p></caption>
<table rules="groups" frame="hsides">
<thead>
<tr>
<th align="center" valign="middle">Participant</th>
<th align="center" valign="middle">Reference word count (n)</th>
<th align="center" valign="middle">WER</th>
<th align="center" valign="middle">Substitutions, S (n, %)</th>
<th align="center" valign="middle">Deletions, D (n, %)</th>
<th align="center" valign="middle">Insertions, I (n, %)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center" valign="top">S-001</td>
<td align="center" valign="top">335</td>
<td align="center" valign="top">0.2985</td>
<td align="center" valign="top">65 (19.40)</td>
<td align="center" valign="top">16 (4.80)</td>
<td align="center" valign="top">19 (5.70)</td>
</tr>
<tr>
<td align="center" valign="top">S-002</td>
<td align="center" valign="top">344</td>
<td align="center" valign="top">0.1802</td>
<td align="center" valign="top">46 (13.40)</td>
<td align="center" valign="top">6 (1.70)</td>
<td align="center" valign="top">10 (2.90)</td>
</tr>
</tbody>
</table>
</table-wrap>

<table-wrap id="t3-kjorl-hns-2026-00052" position="float">
<label>Table 3.</label>
<caption><p>Speech audiometry measures and Whisper transcription metrics in two elderly listeners</p></caption>
<table rules="groups" frame="hsides">
<thead>
<tr>
<th align="center" valign="middle">Participant</th>
<th align="center" valign="middle">KSA sentence recognition (%)</th>
<th align="center" valign="middle">KSA word recognition (%)</th>
<th align="center" valign="middle">SRT (dB HL)</th>
<th align="center" valign="middle">WRS (%)</th>
<th align="center" valign="middle">AI sentence match rate (ignoring whitespace, %)</th>
<th align="center" valign="middle">AI sentence match rate (strict whitespace, %)</th>
<th align="center" valign="middle">AI WER</th>
<th align="center" valign="middle">AI CER</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center" valign="top">S-001</td>
<td align="center" valign="top">40</td>
<td align="center" valign="top">60</td>
<td align="center" valign="top">55</td>
<td align="center" valign="top">68</td>
<td align="center" valign="top">38.5</td>
<td align="center" valign="top">28.2</td>
<td align="center" valign="top">0.2985</td>
<td align="center" valign="top">0.1486</td>
</tr>
<tr>
<td align="center" valign="top">S-002</td>
<td align="center" valign="top">80</td>
<td align="center" valign="top">88</td>
<td align="center" valign="top">45</td>
<td align="center" valign="top">84</td>
<td align="center" valign="top">71.2</td>
<td align="center" valign="top">63.7</td>
<td align="center" valign="top">0.1802</td>
<td align="center" valign="top">0.0811</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p>KSA, Korean Speech Audiometry; SRT, speech recognition threshold; WRS, word recognition score; WER, word error rate; CER, character error rate.</p></fn>
</table-wrap-foot>
</table-wrap>

</sec>
</back></article>