165

Speech Input

Previous · Next

A compact speech-to-text input component with real-time transcription using ElevenLabs Scribe.

<script setup lang="ts">
import {
  SpeechInput,
  SpeechInputCancelButton,
  SpeechInputPreview,
  SpeechInputRecordButton,
} from '@/components/elevenlabs-ui/speech-input'
import { Input } from '@/components/ui/input'
import { Textarea } from '@/components/ui/textarea'
import { ref } from 'vue'
import { toast } from 'vue-sonner'

/**
 * Fetches a single-use ElevenLabs Scribe token from the app's server route.
 * Token minting happens server-side so the API key is never exposed to the
 * browser (see the server route example further down this page).
 *
 * @returns the realtime-transcription token string
 * @throws Error when the request fails, the server reports an error,
 *   or the response body contains no token
 */
async function getToken(): Promise<string> {
  try {
    const response = await fetch('/api/get-scribe-token', {
      method: 'POST',
    })

    if (!response.ok) {
      throw new Error('Failed to get token')
    }

    // Server responds with { token } on success or { error } on failure.
    const data = await response.json()
    if (data.error) {
      throw new Error(data.error)
    }

    // Guard against a 200 response that is missing the token field —
    // returning undefined here would surface as a confusing downstream
    // WebSocket auth failure instead of a clear error.
    if (typeof data.token !== 'string') {
      throw new Error('Token missing from response')
    }

    return data.token
  }
  catch (error) {
    // Log for diagnostics, then rethrow so SpeechInput's error handling
    // (the @error emit) still fires.
    console.error(error)
    throw error
  }
}

// --- TextareaWithSpeechInputRight ---
// The textarea's value plus a snapshot taken when recording starts, so the
// live transcript can be appended without duplicating earlier text.
const rightValue = ref('')
const rightValueAtStart = ref('')

/** Snapshot the current textarea contents before transcription begins. */
function onStartRight() {
  rightValueAtStart.value = rightValue.value
}

/** Live update while recording: snapshot + in-progress transcript. */
function onChangeRight(data: { transcript: string }) {
  rightValue.value = `${rightValueAtStart.value}${data.transcript}`
}

/** Final update when recording stops: snapshot + committed transcript. */
function onStopRight(data: { transcript: string }) {
  rightValue.value = `${rightValueAtStart.value}${data.transcript}`
}

/** Cancel: discard the transcript and restore the pre-recording snapshot. */
function onCancelRight() {
  rightValue.value = rightValueAtStart.value
}

/**
 * Shared error handler for all three demo instances.
 * `String(error)` on a DOM Event would render the unhelpful
 * "[object Event]", so extract a readable message instead.
 */
function onError(error: Error | Event) {
  const message = error instanceof Error ? error.message : 'Speech input error'
  toast.error(message)
}

// --- TextareaWithSpeechInputLeft ---
// Same snapshot-and-append pattern as the right-aligned demo above, bound
// to the left-aligned textarea.
const leftValue = ref('')
const leftValueAtStart = ref('')

/** Snapshot the current textarea contents before transcription begins. */
function onStartLeft() {
  leftValueAtStart.value = leftValue.value
}

/** Live update while recording: snapshot + in-progress transcript. */
function onChangeLeft(data: { transcript: string }) {
  leftValue.value = `${leftValueAtStart.value}${data.transcript}`
}

/** Final update when recording stops: snapshot + committed transcript. */
function onStopLeft(data: { transcript: string }) {
  leftValue.value = `${leftValueAtStart.value}${data.transcript}`
}

/** Cancel: discard the transcript and restore the pre-recording snapshot. */
function onCancelLeft() {
  leftValue.value = leftValueAtStart.value
}

// --- InputWithSpeechInput ---
// Same snapshot-and-append pattern, bound to the single-line Input demo.
const inputValue = ref('')
const inputValueAtStart = ref('')

/** Snapshot the current input contents before transcription begins. */
function onStartInput() {
  inputValueAtStart.value = inputValue.value
}

/** Live update while recording: snapshot + in-progress transcript. */
function onChangeInput(data: { transcript: string }) {
  inputValue.value = `${inputValueAtStart.value}${data.transcript}`
}

/** Final update when recording stops: snapshot + committed transcript. */
function onStopInput(data: { transcript: string }) {
  inputValue.value = `${inputValueAtStart.value}${data.transcript}`
}

/** Cancel: discard the transcript and restore the pre-recording snapshot. */
function onCancelInput() {
  inputValue.value = inputValueAtStart.value
}
</script>

<template>
  <!-- Three demo layouts; each binds one SpeechInput to its own v-model
       state via the snapshot-and-append handlers in the script above. -->
  <div class="absolute inset-0 space-y-4 overflow-auto rounded-2xl p-10">
    <!-- TextareaWithSpeechInputRight: controls overlaid bottom-right.
         Child order (cancel, preview, record) drives the reversed layout. -->
    <div class="relative">
      <Textarea
        v-model="rightValue"
        placeholder="Jot down some thoughts..."
        class="min-h-[120px] resize-none rounded-2xl px-3.5 pt-3 pb-14"
      />
      <div class="absolute right-3 bottom-3 flex items-center gap-2">
        <SpeechInput
          size="sm"
          :get-token="getToken"
          @start="onStartRight"
          @change="onChangeRight"
          @stop="onStopRight"
          @cancel="onCancelRight"
          @error="onError"
        >
          <SpeechInputCancelButton />
          <SpeechInputPreview placeholder="Listening..." />
          <SpeechInputRecordButton />
        </SpeechInput>
      </div>
    </div>

    <!-- TextareaWithSpeechInputLeft: controls overlaid bottom-left, with
         child order reversed (record button first). -->
    <div class="relative">
      <Textarea
        v-model="leftValue"
        placeholder="Jot down some thoughts..."
        class="min-h-[120px] resize-none rounded-2xl px-3.5 pt-3 pb-14"
      />
      <div class="absolute bottom-3 left-3 flex items-center gap-2">
        <SpeechInput
          size="sm"
          :get-token="getToken"
          @start="onStartLeft"
          @change="onChangeLeft"
          @stop="onStopLeft"
          @cancel="onCancelLeft"
          @error="onError"
        >
          <SpeechInputRecordButton />
          <SpeechInputPreview placeholder="Listening..." />
          <SpeechInputCancelButton />
        </SpeechInput>
      </div>
    </div>

    <!-- InputWithSpeechInput: compact variant beside a single-line Input,
         omitting the preview component. -->
    <div class="flex items-center gap-2.5">
      <Input
        v-model="inputValue"
        placeholder="Give this idea a title..."
        class="min-w-0 flex-1 px-3.5 text-base transition-[flex-basis] duration-200 md:text-sm"
      />
      <SpeechInput
        class="shrink-0"
        :get-token="getToken"
        @start="onStartInput"
        @change="onChangeInput"
        @stop="onStopInput"
        @cancel="onCancelInput"
        @error="onError"
      >
        <SpeechInputCancelButton />
        <SpeechInputRecordButton />
      </SpeechInput>
    </div>
  </div>
</template>

Installation

pnpm dlx elevenlabs-ui-vue@latest add speech-input

Usage

import {
SpeechInput,
SpeechInputCancelButton,
SpeechInputPreview,
SpeechInputRecordButton,
} from "@/components/elevenlabs-ui/speech-input"

Basic Usage

<script setup lang="ts">
async function getToken() {
const response = await fetch("/api/get-scribe-token", { method: "POST" })
const json = await response.json()
return json.token
}

function handleChange(data: { transcript: string }) {
console.log(data.transcript)
}

function handleStop(data: { transcript: string }) {
console.log("Final:", data.transcript)
}
</script>

<template>
  <SpeechInput
    :getToken="getToken"
    @change="handleChange"
    @stop="handleStop"
  >
    <SpeechInputRecordButton />
    <SpeechInputPreview placeholder="Start speaking..." />
    <SpeechInputCancelButton />
  </SpeechInput>
</template>

With Form Input

This example reuses the `getToken` helper from Basic Usage above.

<script setup lang="ts">
import { ref } from "vue"

const value = ref("")

function handleStop(data: { transcript: string }) {
value.value = value.value + " " + data.transcript
}
</script>

<template>
  <div class="flex items-center gap-2">
    <input
      v-model="value"
      class="flex-1 rounded border px-3 py-2"
    />
    <SpeechInput
      :getToken="getToken"
      @stop="handleStop"
    >
      <SpeechInputRecordButton />
      <SpeechInputPreview />
      <SpeechInputCancelButton />
    </SpeechInput>
  </div>
</template>

Reversed Layout

The component automatically adjusts its layout based on child order:

<template>
  <SpeechInput :getToken="getToken">
    <SpeechInputCancelButton />
    <SpeechInputPreview />
    <SpeechInputRecordButton />
  </SpeechInput>
</template>

Minimal (Record Button Only)

<template>
  <SpeechInput
    :getToken="getToken"
    @stop="handleStop"
  >
    <SpeechInputRecordButton />
  </SpeechInput>
</template>

Custom Placeholder

<template>
  <SpeechInput :getToken="getToken">
    <SpeechInputRecordButton />
    <SpeechInputPreview placeholder="Say something..." />
    <SpeechInputCancelButton />
  </SpeechInput>
</template>

Using the Composable

Access the speech input context in child components:

<script setup lang="ts">
import { useSpeechInput } from "@/components/elevenlabs-ui/speech-input"

const { transcript, isConnected, isConnecting } = useSpeechInput()
</script>

<template>
<div>
  <p>
    Status:
    <span v-if="isConnecting">Connecting</span>
    <span v-else-if="isConnected">Recording</span>
    <span v-else>Idle</span>
  </p>
  <p>Transcript: {{ transcript }}</p>
</div>
</template>

Server Route for Token

Create a server action to securely fetch the Scribe token:

server/api/get-scribe-token.post.ts
export default defineEventHandler(async () => {
const response = await fetch(
  "https://api.elevenlabs.io/v1/speech-to-text/get-realtime-token",
  {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "xi-api-key": process.env.ELEVENLABS_API_KEY as string,
    },
    body: JSON.stringify({
      model_id: "scribe_v2_realtime",
      ttl_secs: 300,
    }),
  }
)

const data = await response.json()
return { token: data.token }
})

API Reference

SpeechInput

The root component that manages speech-to-text state and provides context to child components.

Props

PropTypeDefaultDescription
getToken() => Promise<string>-Function to fetch ElevenLabs Scribe token
modelIdstring"scribe_v2_realtime"ElevenLabs model ID
baseUristring-Custom WebSocket base URI
commitStrategyCommitStrategy"vad"How transcripts are committed ("manual" or "vad")
vadSilenceThresholdSecsnumber-VAD silence threshold (0.3-3.0)
vadThresholdnumber-VAD threshold (0.1-0.9)
minSpeechDurationMsnumber-Minimum speech duration (50-2000ms)
minSilenceDurationMsnumber-Minimum silence duration (50-2000ms)
languageCodestring-ISO-639-1/3 language code
microphoneMicrophoneOptionsSee belowMicrophone configuration
audioFormatAudioFormat-Audio format for manual streaming
sampleRatenumber-Sample rate for manual streaming
classstring-Optional CSS classes

Emits

EventTypeDescription
change(data: SpeechInputData) => voidCalled when transcript changes
start(data: SpeechInputData) => voidCalled when recording starts
stop(data: SpeechInputData) => voidCalled when recording stops
cancel(data: SpeechInputData) => voidCalled when recording is cancelled
error(error: Error | Event) => voidCalled on connection errors
authError(data: { error: string }) => voidCalled on authentication errors
quotaExceededError(data: { error: string }) => voidCalled when quota is exceeded

Default Microphone Options

{
echoCancellation: true,
noiseSuppression: true
}

SpeechInputRecordButton

Toggle button that switches between microphone icon (idle), connecting indicator, and stop icon (recording).

Props

PropTypeDescription
classstringOptional CSS classes
disabledbooleanDisable the button
...propsInstanceType<typeof Button>All button props

Emits

EventTypeDescription
click(e: MouseEvent) => voidAdditional click handler

SpeechInputPreview

Displays the current transcript with smooth text animations.

Props

PropTypeDefaultDescription
placeholderstring"Listening..."Text shown when empty
classstring-Optional CSS classes
...propsHTMLDivElement-All div props

SpeechInputCancelButton

Button to cancel the current recording and clear the transcript.

Props

PropTypeDescription
classstringOptional CSS classes
...propsHTMLButtonElementAll button props

Emits

EventTypeDescription
click(e: MouseEvent) => voidAdditional click handler

useSpeechInput

Composable to access speech input context from child components.

Returns

PropertyTypeDescription
isConnectedbooleanWhether currently connected/recording
isConnectingbooleanWhether connection is in progress
transcriptstringFull transcript (committed + partial)
partialTranscriptstringCurrent partial transcript
committedTranscriptsstring[]Array of committed transcripts
errorstring | nullCurrent error message
start() => Promise<void>Start recording
stop() => voidStop recording
cancel() => voidCancel and clear transcript

SpeechInputData

Data object passed to callbacks.

interface SpeechInputData {
partialTranscript: string
committedTranscripts: string[]
transcript: string // Combined full transcript
}

CommitStrategy

enum CommitStrategy {
MANUAL = "manual",
VAD = "vad",
}

AudioFormat

enum AudioFormat {
PCM_8000 = "pcm_8000",
PCM_16000 = "pcm_16000",
PCM_22050 = "pcm_22050",
PCM_24000 = "pcm_24000",
PCM_44100 = "pcm_44100",
PCM_48000 = "pcm_48000",
ULAW_8000 = "ulaw_8000",
}

Features

  • Real-time Transcription: Live speech-to-text using ElevenLabs Scribe
  • Compound Components: Flexible composition with record button, preview, and cancel
  • Animated Transitions: Smooth expand/collapse animations using motion-v
  • Voice Activity Detection: Automatic transcript commits based on speech pauses
  • Visual Feedback: Distinct states for idle, connecting, and recording
  • Accessibility: Proper ARIA labels and keyboard interaction

Notes

  • Requires an ElevenLabs API key for generating Scribe tokens
  • Token generation should happen server-side to protect your API key
  • The component automatically handles microphone permissions
  • Uses WebSocket for real-time communication with ElevenLabs Scribe API
  • VAD (Voice Activity Detection) mode automatically commits transcripts during pauses
  • The preview component uses a gradient mask for text overflow
  • Layout automatically adjusts based on whether the record button is first or last