165

Speech Input

Previous · Next

A compact speech-to-text input component with real-time transcription using ElevenLabs Scribe.

<script setup lang="ts">
import {
  SpeechInput,
  SpeechInputCancelButton,
  SpeechInputPreview,
  SpeechInputRecordButton,
} from '@/components/elevenlabs-ui/speech-input'
import { Input } from '@/components/ui/input'
import { Textarea } from '@/components/ui/textarea'
import { ref } from 'vue'
import { toast } from 'vue-sonner'

/**
 * Fetches a single-use ElevenLabs Scribe token from the app's server route.
 * Token minting happens server-side so the API key is never exposed to the
 * browser (see the server route example further down this page).
 *
 * @returns the realtime-transcription token string
 * @throws Error when the request fails, the server reports an error,
 *   or the response body contains no token
 */
async function getToken(): Promise<string> {
  try {
    const response = await fetch('/api/get-scribe-token', {
      method: 'POST',
    })

    if (!response.ok) {
      throw new Error('Failed to get token')
    }

    // Server responds with { token } on success or { error } on failure.
    const data = await response.json()
    if (data.error) {
      throw new Error(data.error)
    }

    // Guard against a 200 response that is missing the token field —
    // returning undefined here would surface as a confusing downstream
    // WebSocket auth failure instead of a clear error.
    if (typeof data.token !== 'string') {
      throw new Error('Token missing from response')
    }

    return data.token
  }
  catch (error) {
    // Log for diagnostics, then rethrow so SpeechInput's error handling
    // (the @error emit) still fires.
    console.error(error)
    throw error
  }
}

// --- TextareaWithSpeechInputRight ---
// The textarea's value plus a snapshot taken when recording starts, so the
// live transcript can be appended without duplicating earlier text.
const rightValue = ref('')
const rightValueAtStart = ref('')

/** Snapshot the current textarea contents before transcription begins. */
function onStartRight() {
  rightValueAtStart.value = rightValue.value
}

/** Live update while recording: snapshot + in-progress transcript. */
function onChangeRight(data: { transcript: string }) {
  rightValue.value = `${rightValueAtStart.value}${data.transcript}`
}

/** Final update when recording stops: snapshot + committed transcript. */
function onStopRight(data: { transcript: string }) {
  rightValue.value = `${rightValueAtStart.value}${data.transcript}`
}

/** Cancel: discard the transcript and restore the pre-recording snapshot. */
function onCancelRight() {
  rightValue.value = rightValueAtStart.value
}

/**
 * Shared error handler for all three demo instances.
 * `String(error)` on a DOM Event would render the unhelpful
 * "[object Event]", so extract a readable message instead.
 */
function onError(error: Error | Event) {
  const message = error instanceof Error ? error.message : 'Speech input error'
  toast.error(message)
}

// --- TextareaWithSpeechInputLeft ---
// Same snapshot-and-append pattern as the right-aligned demo above, bound
// to the left-aligned textarea.
const leftValue = ref('')
const leftValueAtStart = ref('')

/** Snapshot the current textarea contents before transcription begins. */
function onStartLeft() {
  leftValueAtStart.value = leftValue.value
}

/** Live update while recording: snapshot + in-progress transcript. */
function onChangeLeft(data: { transcript: string }) {
  leftValue.value = `${leftValueAtStart.value}${data.transcript}`
}

/** Final update when recording stops: snapshot + committed transcript. */
function onStopLeft(data: { transcript: string }) {
  leftValue.value = `${leftValueAtStart.value}${data.transcript}`
}

/** Cancel: discard the transcript and restore the pre-recording snapshot. */
function onCancelLeft() {
  leftValue.value = leftValueAtStart.value
}

// --- InputWithSpeechInput ---
// Same snapshot-and-append pattern, bound to the single-line Input demo.
const inputValue = ref('')
const inputValueAtStart = ref('')

/** Snapshot the current input contents before transcription begins. */
function onStartInput() {
  inputValueAtStart.value = inputValue.value
}

/** Live update while recording: snapshot + in-progress transcript. */
function onChangeInput(data: { transcript: string }) {
  inputValue.value = `${inputValueAtStart.value}${data.transcript}`
}

/** Final update when recording stops: snapshot + committed transcript. */
function onStopInput(data: { transcript: string }) {
  inputValue.value = `${inputValueAtStart.value}${data.transcript}`
}

/** Cancel: discard the transcript and restore the pre-recording snapshot. */
function onCancelInput() {
  inputValue.value = inputValueAtStart.value
}
</script>

<template>
  <!-- Three demo layouts; each binds one SpeechInput to its own v-model
       state via the snapshot-and-append handlers in the script above. -->
  <div class="absolute inset-0 space-y-4 overflow-auto rounded-2xl p-10">
    <!-- TextareaWithSpeechInputRight: controls overlaid bottom-right.
         Child order (cancel, preview, record) drives the reversed layout. -->
    <div class="relative">
      <Textarea
        v-model="rightValue"
        placeholder="Jot down some thoughts..."
        class="min-h-[120px] resize-none rounded-2xl px-3.5 pt-3 pb-14"
      />
      <div class="absolute right-3 bottom-3 flex items-center gap-2">
        <SpeechInput
          size="sm"
          :get-token="getToken"
          @start="onStartRight"
          @change="onChangeRight"
          @stop="onStopRight"
          @cancel="onCancelRight"
          @error="onError"
        >
          <SpeechInputCancelButton />
          <SpeechInputPreview placeholder="Listening..." />
          <SpeechInputRecordButton />
        </SpeechInput>
      </div>
    </div>

    <!-- TextareaWithSpeechInputLeft: controls overlaid bottom-left, with
         child order reversed (record button first). -->
    <div class="relative">
      <Textarea
        v-model="leftValue"
        placeholder="Jot down some thoughts..."
        class="min-h-[120px] resize-none rounded-2xl px-3.5 pt-3 pb-14"
      />
      <div class="absolute bottom-3 left-3 flex items-center gap-2">
        <SpeechInput
          size="sm"
          :get-token="getToken"
          @start="onStartLeft"
          @change="onChangeLeft"
          @stop="onStopLeft"
          @cancel="onCancelLeft"
          @error="onError"
        >
          <SpeechInputRecordButton />
          <SpeechInputPreview placeholder="Listening..." />
          <SpeechInputCancelButton />
        </SpeechInput>
      </div>
    </div>

    <!-- InputWithSpeechInput: compact variant beside a single-line Input,
         omitting the preview component. -->
    <div class="flex items-center gap-2.5">
      <Input
        v-model="inputValue"
        placeholder="Give this idea a title..."
        class="min-w-0 flex-1 px-3.5 text-base transition-[flex-basis] duration-200 md:text-sm"
      />
      <SpeechInput
        class="shrink-0"
        :get-token="getToken"
        @start="onStartInput"
        @change="onChangeInput"
        @stop="onStopInput"
        @cancel="onCancelInput"
        @error="onError"
      >
        <SpeechInputCancelButton />
        <SpeechInputRecordButton />
      </SpeechInput>
    </div>
  </div>
</template>

Installation

pnpm dlx elevenlabs-ui-vue@latest add speech-input

Usage

import {
SpeechInput,
SpeechInputCancelButton,
SpeechInputPreview,
SpeechInputRecordButton,
} from "@/components/elevenlabs-ui/speech-input"

Basic Usage

<script setup lang="ts">
async function getToken() {
const response = await fetch("/api/get-scribe-token", { method: "POST" })
const json = await response.json()
return json.token
}

function handleChange(data: { transcript: string }) {
console.log(data.transcript)
}

function handleStop(data: { transcript: string }) {
console.log("Final:", data.transcript)
}
</script>

<template>
  <SpeechInput
    :getToken="getToken"
    @change="handleChange"
    @stop="handleStop"
  >
    <SpeechInputRecordButton />
    <SpeechInputPreview placeholder="Start speaking..." />
    <SpeechInputCancelButton />
  </SpeechInput>
</template>

With Form Input

This example reuses the `getToken` helper from Basic Usage above.

<script setup lang="ts">
import { ref } from "vue"

const value = ref("")

function handleStop(data: { transcript: string }) {
value.value = value.value + " " + data.transcript
}
</script>

<template>
  <div class="flex items-center gap-2">
    <input
      v-model="value"
      class="flex-1 rounded border px-3 py-2"
    />
    <SpeechInput
      :getToken="getToken"
      @stop="handleStop"
    >
      <SpeechInputRecordButton />
      <SpeechInputPreview />
      <SpeechInputCancelButton />
    </SpeechInput>
  </div>
</template>

Reversed Layout

The component automatically adjusts its layout based on child order:

<template>
  <SpeechInput :getToken="getToken">
    <SpeechInputCancelButton />
    <SpeechInputPreview />
    <SpeechInputRecordButton />
  </SpeechInput>
</template>

Minimal (Record Button Only)

<template>
  <SpeechInput
    :getToken="getToken"
    @stop="handleStop"
  >
    <SpeechInputRecordButton />
  </SpeechInput>
</template>

Custom Placeholder

<template>
  <SpeechInput :getToken="getToken">
    <SpeechInputRecordButton />
    <SpeechInputPreview placeholder="Say something..." />
    <SpeechInputCancelButton />
  </SpeechInput>
</template>

Using the Composable

Access the speech input context in child components:

<script setup lang="ts">
import { useSpeechInput } from "@/components/elevenlabs-ui/speech-input"

const { transcript, isConnected, isConnecting } = useSpeechInput()
</script>

<template>
<div>
  <p>
    Status:
    <span v-if="isConnecting">Connecting</span>
    <span v-else-if="isConnected">Recording</span>
    <span v-else>Idle</span>
  </p>
  <p>Transcript: {{ transcript }}</p>
</div>
</template>

Server Route for Token

Create a server action to securely fetch the Scribe token:

server/api/get-scribe-token.post.ts
export default defineEventHandler(async () => {
const response = await fetch(
  "https://api.elevenlabs.io/v1/speech-to-text/get-realtime-token",
  {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "xi-api-key": process.env.ELEVENLABS_API_KEY as string,
    },
    body: JSON.stringify({
      model_id: "scribe_v2_realtime",
      ttl_secs: 300,
    }),
  }
)

const data = await response.json()
return { token: data.token }
})

API Reference

SpeechInput

The root component that manages speech-to-text state and provides context to child components.

Props

PropTypeDefaultDescription
getToken() => Promise<string>-Function to fetch ElevenLabs Scribe token
modelIdstring"scribe_v2_realtime"ElevenLabs model ID
baseUristring-Custom WebSocket base URI
commitStrategyCommitStrategy"vad"How transcripts are committed ("manual" or "vad")
vadSilenceThresholdSecsnumber-VAD silence threshold (0.3-3.0)
vadThresholdnumber-VAD threshold (0.1-0.9)
minSpeechDurationMsnumber-Minimum speech duration (50-2000ms)
minSilenceDurationMsnumber-Minimum silence duration (50-2000ms)
languageCodestring-ISO-639-1/3 language code
microphoneMicrophoneOptionsSee belowMicrophone configuration
audioFormatAudioFormat-Audio format for manual streaming
sampleRatenumber-Sample rate for manual streaming
classstring-Optional CSS classes

Emits

EventTypeDescription
change(data: SpeechInputData) => voidCalled when transcript changes
start(data: SpeechInputData) => voidCalled when recording starts
stop(data: SpeechInputData) => voidCalled when recording stops
cancel(data: SpeechInputData) => voidCalled when recording is cancelled
error(error: Error | Event) => voidCalled on connection errors
authError(data: { error: string }) => voidCalled on authentication errors
quotaExceededError(data: { error: string }) => voidCalled when quota is exceeded

Default Microphone Options

{
echoCancellation: true,
noiseSuppression: true
}

SpeechInputRecordButton

Toggle button that switches between microphone icon (idle), connecting indicator, and stop icon (recording).

Props

PropTypeDescription
classstringOptional CSS classes
disabledbooleanDisable the button
...propsInstanceType<typeof Button>All button props

Emits

EventTypeDescription
click(e: MouseEvent) => voidAdditional click handler

SpeechInputPreview

Displays the current transcript with smooth text animations.

Props

PropTypeDefaultDescription
placeholderstring"Listening..."Text shown when empty
classstring-Optional CSS classes
...propsHTMLDivElement-All div props

SpeechInputCancelButton

Button to cancel the current recording and clear the transcript.

Props

PropTypeDescription
classstringOptional CSS classes
...propsHTMLButtonElementAll button props

Emits

EventTypeDescription
click(e: MouseEvent) => voidAdditional click handler

useSpeechInput

Composable to access speech input context from child components.

Returns

PropertyTypeDescription
isConnectedbooleanWhether currently connected/recording
isConnectingbooleanWhether connection is in progress
transcriptstringFull transcript (committed + partial)
partialTranscriptstringCurrent partial transcript
committedTranscriptsstring[]Array of committed transcripts
errorstring | nullCurrent error message
start() => Promise<void>Start recording
stop() => voidStop recording
cancel() => voidCancel and clear transcript

SpeechInputData

Data object passed to callbacks.

interface SpeechInputData {
partialTranscript: string
committedTranscripts: string[]
transcript: string // Combined full transcript
}

CommitStrategy

enum CommitStrategy {
MANUAL = "manual",
VAD = "vad",
}

AudioFormat

enum AudioFormat {
PCM_8000 = "pcm_8000",
PCM_16000 = "pcm_16000",
PCM_22050 = "pcm_22050",
PCM_24000 = "pcm_24000",
PCM_44100 = "pcm_44100",
PCM_48000 = "pcm_48000",
ULAW_8000 = "ulaw_8000",
}

Features

  • Real-time Transcription: Live speech-to-text using ElevenLabs Scribe
  • Compound Components: Flexible composition with record button, preview, and cancel
  • Animated Transitions: Smooth expand/collapse animations using motion-v
  • Voice Activity Detection: Automatic transcript commits based on speech pauses
  • Visual Feedback: Distinct states for idle, connecting, and recording
  • Accessibility: Proper ARIA labels and keyboard interaction

Notes

  • Requires an ElevenLabs API key for generating Scribe tokens
  • Token generation should happen server-side to protect your API key
  • The component automatically handles microphone permissions
  • Uses WebSocket for real-time communication with ElevenLabs Scribe API
  • VAD (Voice Activity Detection) mode automatically commits transcripts during pauses
  • The preview component uses a gradient mask for text overflow
  • Layout automatically adjusts based on whether the record button is first or last