Add professional voice assistant server implementation

- FastAPI-based TTS server using Piper neural text-to-speech
- Poetry for dependency management and virtual environments
- OpenAI-compatible API endpoints for seamless integration
- Support for multiple voice models (Ryan, Alan, Lessac)
- Robust error handling and voice fallback system
- Professional logging and configuration management
- Docker-ready with proper Python packaging

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-08-17 14:56:01 +02:00
parent 82f9cc4990
commit 572434d42e
13 changed files with 1722 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
"""Homelab Voice Server - Local TTS server for Claude Code."""
# Package metadata.
__version__ = "0.1.0"
__author__ = "Homelab"
__description__ = "Local TTS server using Piper for Claude Code voice assistant"
# Re-export the main entry points so they can be imported from the package
# root. NOTE: importing .api runs that module's top-level setup (logging
# configuration, TTSService construction and FastAPI app creation).
from .config import config
from .tts import TTSService
from .api import app
# Names exported by a star-import of this package.
__all__ = ["config", "TTSService", "app"]

View File

@@ -0,0 +1,169 @@
"""FastAPI application for voice server."""
import logging
from typing import Optional
from fastapi import FastAPI, HTTPException, Response
from pydantic import BaseModel, Field
from .tts import TTSService
from .config import config
# Configure logging from the configured level name. An unknown level name
# falls back to INFO instead of raising AttributeError while this module is
# being imported.
logging.basicConfig(
    level=getattr(logging, config.log_level.upper(), logging.INFO)
)
logger = logging.getLogger(__name__)

# Build the TTS service once, at import time. A failure is deliberately not
# fatal: the app still starts and the endpoints that need the service answer
# with HTTP 503 while `tts_service` is None.
try:
    tts_service = TTSService()
except Exception as e:
    # logger.exception keeps the traceback, which the plain error log dropped.
    logger.exception("Failed to initialize TTS service: %s", e)
    tts_service = None

app = FastAPI(
    title="Homelab Voice Server",
    description="Local TTS server for Claude Code voice assistant using Piper",
    version="0.1.0",
)
class TTSRequest(BaseModel):
    """Body accepted by the speech-synthesis endpoint."""

    # The only required field: the text that will be spoken.
    input: str = Field(
        ...,
        description="Text to synthesize",
    )
    # Accepted for client compatibility.
    model: str = Field(
        default="tts-1",
        description="Model to use (for compatibility)",
    )
    voice: str = Field(
        default="alloy",
        description="Voice to use",
    )
    response_format: str = Field(
        default="mp3",
        description="Audio format (ignored, always returns wav)",
    )
    # Speed multiplier, validated to the closed range [0.25, 4.0].
    speed: float = Field(
        default=1.0,
        description="Speech speed",
        ge=0.25,
        le=4.0,
    )
class ModelInfo(BaseModel):
    """One entry of the model listing."""

    # Model identifier; the only field without a default.
    id: str
    # Remaining fields default to fixed values.
    object: str = "model"
    created: int = 1677649963
    owned_by: str = "piper"
class ModelsResponse(BaseModel):
    """Envelope returned by the models endpoint: a typed list of models."""

    # Fixed discriminator for the envelope.
    object: str = "list"
    # The listed models.
    data: list[ModelInfo]
@app.get("/health")
async def health_check():
    """Report service health.

    Raises:
        HTTPException: 503 when the TTS service failed to initialize.
    """
    if tts_service is None:
        raise HTTPException(status_code=503, detail="TTS service not available")

    configured_voice_count = len(config.available_voices)
    return dict(
        status="healthy",
        tts_available=True,
        default_voice=config.default_voice,
        voices_available=configured_voice_count,
    )
@app.get("/v1/models", response_model=ModelsResponse)
async def list_models():
    """Return the fixed list of model ids (OpenAI compatible)."""
    model_ids = ("tts-1", "tts-1-hd")
    entries = [ModelInfo(id=model_id, owned_by="piper") for model_id in model_ids]
    return ModelsResponse(object="list", data=entries)
@app.get("/v1/voices")
async def list_voices():
    """Return every configured voice under the "voices" key.

    Raises:
        HTTPException: 503 when the TTS service failed to initialize.
    """
    if tts_service is not None:
        return {"voices": tts_service.list_voices()}
    raise HTTPException(status_code=503, detail="TTS service not available")
@app.get("/v1/voices/{voice_name}")
async def get_voice_info(voice_name: str):
    """Return the details of a single voice.

    Raises:
        HTTPException: 503 when the TTS service failed to initialize,
            404 when the service does not know ``voice_name``.
    """
    if tts_service is None:
        raise HTTPException(status_code=503, detail="TTS service not available")

    try:
        return tts_service.get_voice_info(voice_name)
    except ValueError as not_found:
        # The service signals an unknown voice with ValueError.
        raise HTTPException(status_code=404, detail=str(not_found))
@app.post("/v1/audio/speech")
async def create_speech(request: TTSRequest):
    """
    Create speech from text (OpenAI compatible).

    Maps the requested voice name onto a configured voice (falling back to
    the default voice when it is unknown), synthesizes the text and returns
    the raw audio as a WAV attachment.

    Raises:
        HTTPException: 503 when the TTS service failed to initialize,
            400 when the service rejects the request with ValueError,
            500 when synthesis fails with RuntimeError.
    """
    # Function-scope import: this module does not import asyncio at the top.
    import asyncio

    if tts_service is None:
        raise HTTPException(status_code=503, detail="TTS service not available")

    # Map common voice names to our voices
    voice_mapping = {
        # OpenAI voices
        "alloy": config.default_voice,
        "echo": config.default_voice,
        "fable": config.default_voice,
        "onyx": config.default_voice,
        "nova": "lessac",  # Female voice
        "shimmer": "lessac",  # Female voice
        # Common defaults
        "default": config.default_voice,
        "male": config.default_voice,
        "female": "lessac",
    }

    # Names that are not aliases are taken as-is; unknown names fall back to
    # the default voice rather than failing the request.
    voice_name = voice_mapping.get(request.voice, request.voice)
    if voice_name not in config.available_voices:
        logger.warning(f"Requested voice '{voice_name}' not available, using default: {config.default_voice}")
        voice_name = config.default_voice

    try:
        # synthesize() blocks on a subprocess, so run it in a worker thread
        # instead of stalling the event loop for every other request.
        audio_data, _audio_format = await asyncio.to_thread(
            tts_service.synthesize,
            text=request.input,
            voice=voice_name,
            speed=request.speed,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        logger.error(f"TTS synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=f"TTS synthesis failed: {e}") from e

    # Return raw audio data
    return Response(
        content=audio_data,
        media_type="audio/wav",
        headers={
            "Content-Disposition": "attachment; filename=speech.wav"
        },
    )
@app.get("/")
async def root():
    """Describe the service: name, version, endpoint paths and voices."""
    # Voice names are only listed when the TTS service is up.
    if tts_service:
        voice_names = list(config.available_voices.keys())
    else:
        voice_names = []

    endpoint_paths = {
        "health": "/health",
        "models": "/v1/models",
        "voices": "/v1/voices",
        "speech": "/v1/audio/speech",
    }
    return {
        "service": "Homelab Voice Server",
        "version": "0.1.0",
        "description": "Local TTS server using Piper",
        "endpoints": endpoint_paths,
        "default_voice": config.default_voice,
        "available_voices": voice_names,
    }

View File

@@ -0,0 +1,90 @@
"""Configuration for the voice server."""
import os
from pathlib import Path
from typing import Dict, Any
from pydantic_settings import BaseSettings
from pydantic import Field
class VoiceServerConfig(BaseSettings):
    """Settings model for the voice server.

    Declares the bind address, the voice catalogue, the Piper executable and
    the log level, plus helpers that resolve a voice name to the files
    expected inside ``voices_dir``.
    """

    # Address the server binds to.
    host: str = Field(default="127.0.0.1", description="Server host")
    port: int = Field(default=8880, description="Server port")

    # Voice used whenever a caller does not name one.
    default_voice: str = Field(default="ryan", description="Default voice model")
    voices_dir: Path = Field(
        description="Directory containing voice models",
        default_factory=lambda: Path.home() / ".local/share/piper-voices",
    )

    # Catalogue of known voices, keyed by short name. Each entry names the
    # model/config files (relative to voices_dir) and descriptive metadata.
    available_voices: Dict[str, Dict[str, Any]] = Field(
        default_factory=lambda: {
            "ryan": {
                "model_file": "en_US-ryan-medium.onnx",
                "config_file": "en_US-ryan-medium.onnx.json",
                "language": "en-US",
                "gender": "male",
                "description": "Professional US male voice",
            },
            "alan": {
                "model_file": "en_GB-alan-medium.onnx",
                "config_file": "en_GB-alan-medium.onnx.json",
                "language": "en-GB",
                "gender": "male",
                "description": "Sophisticated British male voice",
            },
            "lessac": {
                "model_file": "en_US-lessac-medium.onnx",
                "config_file": "en_US-lessac-medium.onnx.json",
                "language": "en-US",
                "gender": "female",
                "description": "Natural US female voice",
            },
        }
    )

    # Piper invocation settings.
    piper_executable: str = Field(default="piper-tts", description="Piper TTS executable")
    audio_format: str = Field(default="wav", description="Audio output format")

    # Name of the logging level.
    log_level: str = Field(default="info", description="Logging level")

    class Config:
        env_prefix = "VOICE_SERVER_"
        env_file = ".env"

    def get_voice_model_path(self, voice_name: str = None) -> Path:
        """Return ``voices_dir`` joined with the voice's model file name.

        A falsy ``voice_name`` selects the default voice.

        Raises:
            ValueError: If the voice is not in ``available_voices``.
        """
        selected = self.default_voice if not voice_name else voice_name
        catalogue = self.available_voices
        if selected in catalogue:
            return self.voices_dir / catalogue[selected]["model_file"]
        raise ValueError(f"Voice '{selected}' not found in available voices")

    def get_voice_config_path(self, voice_name: str = None) -> Path:
        """Return ``voices_dir`` joined with the voice's config file name.

        A falsy ``voice_name`` selects the default voice.

        Raises:
            ValueError: If the voice is not in ``available_voices``.
        """
        selected = self.default_voice if not voice_name else voice_name
        catalogue = self.available_voices
        if selected in catalogue:
            return self.voices_dir / catalogue[selected]["config_file"]
        raise ValueError(f"Voice '{selected}' not found in available voices")

    def validate_voice_files(self, voice_name: str = None) -> bool:
        """Return True when both the model and config files exist on disk.

        A falsy ``voice_name`` selects the default voice. An unknown voice
        yields False rather than an exception.
        """
        selected = voice_name or self.default_voice
        try:
            required_files = (
                self.get_voice_model_path(selected),
                self.get_voice_config_path(selected),
            )
            return all(path.exists() for path in required_files)
        except ValueError:
            return False
# Global configuration instance, created once when this module is imported.
# The api, tts and entry-point modules import this same object.
config = VoiceServerConfig()

View File

@@ -0,0 +1,82 @@
"""Main entry point for the voice server."""
import logging
import sys
from pathlib import Path
import uvicorn
from .config import config
from .api import app
logger = logging.getLogger(__name__)
def check_prerequisites() -> list[str]:
    """Check that the voices directory and voice model files are in place.

    Returns:
        Human-readable problem descriptions and hints; empty when every
        prerequisite is met.
    """
    errors = []

    # Check if voices directory exists
    voices_dir = config.voices_dir
    if not voices_dir.exists():
        errors.append(f"Voices directory not found: {voices_dir}")
        # Point at the configured directory, which may differ from the default.
        errors.append(f"Run: mkdir -p {voices_dir}")

    # Check the default voice. An unknown name is reported explicitly:
    # get_voice_model_path() raises ValueError for names missing from the
    # catalogue, which would otherwise crash this check.
    default_voice = config.default_voice
    if default_voice not in config.available_voices:
        errors.append(f"Default voice '{default_voice}' is not a configured voice")
        errors.append(f"Configured voices: {sorted(config.available_voices)}")
    elif not config.validate_voice_files():
        model_path = config.get_voice_model_path()
        errors.append(f"Default voice '{default_voice}' files not found")
        errors.append(f"Expected model at: {model_path}")
        errors.append("Download voice models from: https://huggingface.co/rhasspy/piper-voices")

    # At least one configured voice must have its files on disk.
    if not any(
        config.validate_voice_files(voice) for voice in config.available_voices
    ):
        errors.append("No voice models available")
        errors.append("Please download at least one voice model")

    return errors
def main():
    """Configure logging, verify prerequisites and run the server with uvicorn.

    Exits the process with status 1 (via ``sys.exit``) when
    ``check_prerequisites`` reports any problem.
    """
    # force=True is required: importing .api already called
    # logging.basicConfig(), and a second plain call is a no-op once the root
    # logger has handlers, so this format would never be applied.
    logging.basicConfig(
        level=getattr(logging, config.log_level.upper(), logging.INFO),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        force=True,
    )

    logger.info("Starting Homelab Voice Server")
    # NOTE(review): pydantic v2 is understood to deprecate .dict() in favour of
    # .model_dump() -- confirm the installed version before switching.
    logger.info(f"Configuration: {config.dict()}")

    # Check prerequisites
    errors = check_prerequisites()
    if errors:
        logger.error("Prerequisites not met:")
        for error in errors:
            logger.error(f" - {error}")
        sys.exit(1)

    # Log the voices whose files are actually present on disk.
    available_voices = [
        voice for voice in config.available_voices
        if config.validate_voice_files(voice)
    ]
    logger.info(f"Available voices: {available_voices}")
    logger.info(f"Default voice: {config.default_voice}")

    # Start server
    logger.info(f"Starting server on {config.host}:{config.port}")
    uvicorn.run(
        app,
        host=config.host,
        port=config.port,
        # Lower-cased so a level configured as e.g. "INFO" is passed to
        # uvicorn in the same form as the default "info".
        log_level=config.log_level.lower(),
        access_log=True,
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,158 @@
"""Text-to-speech service using Piper."""
import subprocess
import tempfile
import os
import logging
from pathlib import Path
from typing import Optional, Tuple
from .config import config
logger = logging.getLogger(__name__)
class TTSService:
    """Text-to-speech service using Piper.

    Runs the executable named by ``config.piper_executable``: the text is
    written to its stdin and the audio is read back from a temporary output
    file.
    """

    def __init__(self):
        """Bind the global configuration and validate the setup.

        Raises:
            RuntimeError: If Piper cannot be run or the default voice is
                unusable.
        """
        self.config = config
        self._validate_setup()

    def _validate_setup(self):
        """Validate that piper and the default voice model are available.

        Raises:
            RuntimeError: If the executable is missing, times out or exits
                with a non-zero status, or if the default voice is unknown
                or its files are missing.
        """
        # Check if piper-tts is available
        try:
            result = subprocess.run(
                [self.config.piper_executable, "--help"],
                capture_output=True,
                timeout=10,
            )
        except (subprocess.TimeoutExpired, OSError) as e:
            # OSError also covers PermissionError and similar launch failures,
            # not only FileNotFoundError.
            raise RuntimeError(f"Piper TTS not found or not working: {e}") from e
        if result.returncode != 0:
            stderr_text = result.stderr.decode(errors="replace")
            raise RuntimeError(f"Piper TTS not working: {stderr_text}")

        # An unknown default voice must surface as RuntimeError, not as the
        # ValueError that get_voice_model_path() raises for unknown names.
        default_voice = self.config.default_voice
        if default_voice not in self.config.available_voices:
            raise RuntimeError(
                f"Default voice '{default_voice}' is not a configured voice. "
                f"Configured voices: {list(self.config.available_voices)}"
            )

        # Check if default voice model exists
        if not self.config.validate_voice_files():
            model_path = self.config.get_voice_model_path()
            raise RuntimeError(
                f"Default voice '{default_voice}' model not found at {model_path}. "
                f"Please download the voice model files."
            )

        logger.info(f"TTS service initialized with voice: {self.config.default_voice}")

    def synthesize(
        self,
        text: str,
        voice: Optional[str] = None,
        speed: float = 1.0
    ) -> Tuple[bytes, str]:
        """
        Synthesize text to speech.

        Args:
            text: Text to synthesize
            voice: Voice to use (defaults to configured default)
            speed: Speech speed multiplier

        Returns:
            Tuple of (audio_data, audio_format)

        Raises:
            ValueError: If the text is empty, the speed is not positive, or
                the voice is not available
            RuntimeError: If synthesis fails or times out
        """
        # Reject input that cannot produce audio before spawning a process.
        if not text or not text.strip():
            raise ValueError("Text to synthesize must not be empty")
        if speed <= 0:
            # Also guards the 1.0 / speed division below.
            raise ValueError(f"Speed must be positive, got {speed}")

        voice = voice or self.config.default_voice
        if not self.config.validate_voice_files(voice):
            available_voices = list(self.config.available_voices.keys())
            raise ValueError(
                f"Voice '{voice}' not available. Available voices: {available_voices}"
            )
        model_path = self.config.get_voice_model_path(voice)

        # Reserve a path for piper to write to. delete=False because the file
        # is read back after the handle is closed; `finally` removes it.
        with tempfile.NamedTemporaryFile(
            suffix=f".{self.config.audio_format}", delete=False
        ) as temp_file:
            temp_path = temp_file.name

        try:
            # Build piper command
            cmd = [
                self.config.piper_executable,
                "-m", str(model_path),
                "-f", temp_path,
            ]
            # The multiplier is inverted into a length scale, so a higher
            # speed yields a smaller scale.
            if speed != 1.0:
                cmd.extend(["--length-scale", str(1.0 / speed)])
            logger.debug(f"Running piper command: {' '.join(cmd)}")

            try:
                # subprocess.run kills and reaps the child on timeout, so no
                # orphaned process or open pipe is left behind. Explicit
                # UTF-8 keeps non-ASCII text independent of the locale.
                completed = subprocess.run(
                    cmd,
                    input=text,
                    capture_output=True,
                    text=True,
                    encoding="utf-8",
                    errors="replace",
                    timeout=30,
                )
            except subprocess.TimeoutExpired as e:
                raise RuntimeError("TTS synthesis timed out") from e
            except OSError as e:
                # Launch failures follow the documented RuntimeError contract.
                raise RuntimeError(f"TTS synthesis failed: {e}") from e

            if completed.returncode != 0:
                raise RuntimeError(f"TTS synthesis failed: {completed.stderr}")

            # Read the generated audio file
            with open(temp_path, "rb") as f:
                audio_data = f.read()
            if not audio_data:
                raise RuntimeError("Generated audio file is empty")

            logger.info(f"Successfully synthesized {len(text)} characters with voice '{voice}'")
            return audio_data, self.config.audio_format
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise
        finally:
            # Best-effort cleanup: a failed delete must not mask the result
            # or the original error.
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    def _voice_summary(self, voice_name: str, voice_config: dict) -> dict:
        """Build the description shared by list_voices and get_voice_info."""
        return {
            "name": voice_name,
            "language": voice_config["language"],
            "gender": voice_config["gender"],
            "description": voice_config["description"],
            "available": self.config.validate_voice_files(voice_name),
        }

    def list_voices(self) -> dict:
        """List configured voices, keyed by name, with their information."""
        return {
            voice_name: self._voice_summary(voice_name, voice_config)
            for voice_name, voice_config in self.config.available_voices.items()
        }

    def get_voice_info(self, voice_name: str) -> dict:
        """Get information about a specific voice, including its file paths.

        Raises:
            ValueError: If ``voice_name`` is not a configured voice.
        """
        if voice_name not in self.config.available_voices:
            raise ValueError(f"Voice '{voice_name}' not found")

        voice_config = self.config.available_voices[voice_name]
        return {
            **self._voice_summary(voice_name, voice_config),
            "model_path": str(self.config.get_voice_model_path(voice_name)),
            "config_path": str(self.config.get_voice_config_path(voice_name)),
        }