finish basic voice assistant

This commit is contained in:
Nitwel 2024-03-15 18:27:03 +01:00
parent 30d3ef6004
commit aff66884ca
14 changed files with 328 additions and 20 deletions

3
assets/chat_bubble.blend Normal file
View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c5fa5f006a42d87f43a8b411e4e4bf64a8b4fbbaedd0d02579134a8fa59161eb
size 894176

BIN
assets/chat_bubble.blend1 Normal file

Binary file not shown.

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f215158ae0aba0113e0077040342fc1b508cfec0a3a1e022c9ce0c16973e2ee1
size 17828

View File

@ -0,0 +1,34 @@
[remap]
importer="scene"
importer_version=1
type="PackedScene"
uid="uid://b12raorbby1xd"
path="res://.godot/imported/chat_bubble.glb-03622c64b96f5698360bcfb8a4904483.scn"
[deps]
source_file="res://assets/models/chat_bubble/chat_bubble.glb"
dest_files=["res://.godot/imported/chat_bubble.glb-03622c64b96f5698360bcfb8a4904483.scn"]
[params]
nodes/root_type=""
nodes/root_name=""
nodes/apply_root_scale=true
nodes/root_scale=1.0
meshes/ensure_tangents=true
meshes/generate_lods=true
meshes/create_shadow_meshes=true
meshes/light_baking=1
meshes/lightmap_texel_size=0.2
meshes/force_disable_compression=false
skins/use_named_skins=true
animation/import=true
animation/fps=30
animation/trimming=false
animation/remove_immutable_tracks=true
import_script/path=""
_subresources={}
gltf/naming_version=1
gltf/embedded_image_handling=1

View File

@ -85,5 +85,6 @@ transform = Transform3D(0.499999, -0.000139169, -6.50204e-05, 5.24307e-05, 0.353
[node name="House" parent="." instance=ExtResource("9_np6mw")]
[node name="Assist" parent="." instance=ExtResource("12_8av8q")]
transform = Transform3D(1, -1.39636e-11, 0, 9.47986e-12, 1, 0, 0, 0, 1, 0.000231838, -4.01369e-06, -0.855612)
[editable path="XROrigin3D/XRControllerLeft"]

View File

@ -1,24 +1,79 @@
# Voice-assistant front end: owns the microphone capture effect, the timers and
# the chat-bubble / loader visuals that are driven by the assist handler signals.
extends Node3D
const sample_hold = preload ("res://lib/utils/sample_hold.gd")
const Chat = preload ("./chat.gd")
# Sample rates in Hz. 16000 matches the "sample_rate" sent when the pipeline is
# started; audio_freq is presumably the capture bus mix rate (TODO confirm).
const audio_freq = 44100
const target_freq = 16000
# NOTE(review): both operands are integer constants, so the division truncates
# (44100 / 16000 == 2) and this evaluates to 3.0, not 2.75625 * 1.5. Confirm
# whether the truncation is intended.
const sample_rate_ratio: float = audio_freq / target_freq * 1.5
# Capture effect taken from slot 0 of the "Record" audio bus in _ready().
var effect: AudioEffectCapture
# Amplitude a frame has to exceed before audio is streamed to the assistant.
# NOTE(review): input_threshold is declared twice below (0.05 and 0.1); this
# looks like the before/after pair of a diff - keep only the intended one.
@export var input_threshold: float = 0.05
@export var input_threshold: float = 0.1
@onready var audio_recorder: AudioStreamPlayer = $AudioStreamRecord
# NOTE(review): `timer` and `audio_timer` appear to be the old and new name of
# the same node (the scene renames "Timer" to "AudioTimer") - confirm.
@onready var timer: Timer = $Timer
@onready var audio_timer: Timer = $AudioTimer
# Started when the TTS answer begins playing; its timeout hides the visuals.
@onready var visual_timer: Timer = $VisualTimer
@onready var audio_player_3d: AudioStreamPlayer3D = $AudioStreamPlayer3D
@onready var chat_user: Chat = $ChatUser
@onready var chat_assistant: Chat = $ChatAssistant
@onready var loader: Node3D = $Loader
@onready var camera = $"/root/Main/XROrigin3D/XRCamera3D"
# Set to true when a wake word arrives and to false once the TTS sound starts;
# finish() does nothing while it is true.
var running := true
# Fetches the capture effect of the "Record" bus and wires the timers and the
# assist-handler signals to the loader / chat-bubble visuals.
func _ready():
	var index = AudioServer.get_bus_index("Record")
	effect = AudioServer.get_bus_effect(index, 0)

	# NOTE(review): `running` starts out true, so this initial call returns
	# early and hides nothing - confirm whether that is intended.
	finish()

	audio_timer.timeout.connect(func():
		# An empty packet is sent once the audio window elapses; presumably this
		# marks the end of the audio stream for the pipeline (TODO confirm).
		HomeApi.api.assist_handler.send_data(PackedByteArray())
	)

	HomeApi.api.assist_handler.on_wake_word.connect(func(text):
		loader.visible = true
		chat_user.visible = false
		chat_assistant.visible = false

		# Place the assistant half a unit along the camera's negative Z axis,
		# lower it to 70% of that height and turn it relative to the camera.
		global_position = camera.global_position + camera.global_transform.basis.z * -0.5
		global_position.y *= 0.7
		global_transform.basis = Basis.looking_at((camera.global_position - global_position) * -1)
		running = true
	)

	HomeApi.api.assist_handler.on_stt_message.connect(func(text):
		loader.visible = false
		chat_user.visible = true
		chat_user.text = text
	)

	HomeApi.api.assist_handler.on_tts_message.connect(func(text):
		chat_assistant.visible = true
		chat_assistant.text = text
	)

	HomeApi.api.assist_handler.on_tts_sound.connect(func(audio):
		print("Playing TTS ", audio.data.size())
		audio_player_3d.stream = audio
		audio_player_3d.play()
		visual_timer.start()
		running = false
	)

	visual_timer.timeout.connect(func():
		# Keep the bubbles on screen until the spoken answer has finished.
		if audio_player_3d.playing:
			await audio_player_3d.finished
		finish()
	)
# Hides both chat bubbles and the loader, unless `running` is still true.
func finish():
	if not running:
		for visual in [chat_user, chat_assistant, loader]:
			visual.visible = false
func _process(_delta):
var sterioData: PackedVector2Array = effect.get_buffer(effect.get_frames_available())
@ -41,10 +96,10 @@ func _process(_delta):
data.encode_s16(i * 2, int(value * 32767))
if max_amplitude > input_threshold:
if timer.is_stopped():
if audio_timer.is_stopped():
HomeApi.api.assist_handler.start_wakeword()
timer.start()
audio_timer.start()
if timer.is_stopped() == false:
if audio_timer.is_stopped() == false:
HomeApi.api.assist_handler.send_data(data)

View File

@ -1,6 +1,8 @@
[gd_scene load_steps=3 format=3 uid="uid://oydbwnek6xb4"]
[gd_scene load_steps=5 format=3 uid="uid://oydbwnek6xb4"]
[ext_resource type="Script" path="res://content/system/assist/assist.gd" id="1_5obhy"]
[ext_resource type="PackedScene" uid="uid://cy6jklyde3pgo" path="res://content/system/assist/chat.tscn" id="2_laew1"]
[ext_resource type="PackedScene" uid="uid://b0d1582vpkr8m" path="res://content/system/assist/loader.tscn" id="3_25iy1"]
[sub_resource type="AudioStreamMicrophone" id="AudioStreamMicrophone_6tv2x"]
@ -12,6 +14,22 @@ stream = SubResource("AudioStreamMicrophone_6tv2x")
autoplay = true
bus = &"Record"
[node name="Timer" type="Timer" parent="."]
[node name="AudioTimer" type="Timer" parent="."]
wait_time = 2.0
one_shot = true
[node name="AudioStreamPlayer3D" type="AudioStreamPlayer3D" parent="."]
[node name="VisualTimer" type="Timer" parent="."]
wait_time = 4.0
one_shot = true
[node name="ChatUser" parent="." instance=ExtResource("2_laew1")]
transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, -0.109997, 0.025, 0)
flip = false
[node name="ChatAssistant" parent="." instance=ExtResource("2_laew1")]
transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, -0.0499932, -0.025, 0)
text = "Hello, World!"
[node name="Loader" parent="." instance=ExtResource("3_25iy1")]

View File

@ -0,0 +1,37 @@
# Chat bubble: a Label3D in front of a rigged bubble mesh whose bone is moved
# so the bubble grows with the text. Runs in the editor as well (@tool).
@tool
extends Node3D
const FontTools = preload ("res://lib/utils/font_tools.gd")
# Text shown inside the bubble.
@onready var label: Label3D = $Label3D
# Skeleton of the imported bubble model; update() moves bone 0 or 1 on it.
@onready var chat: Skeleton3D = $chat_bubble/Armature/Skeleton3D
# Bubble mesh; rotated around X by the `flip` setter.
@onready var model: MeshInstance3D = $chat_bubble/Armature/Skeleton3D/Cube
# Text displayed in the bubble. Assignments made before the node is ready are
# deferred until `ready`; afterwards the label is updated and the bubble resized.
@export var text := "Hello, World!":
	set(value):
		if not is_node_ready():
			await ready
		text = value
		label.text = value
		update()
# Mirrors the bubble. Assignments made before the node is ready are deferred
# until `ready`. The stretch is recomputed afterwards because the bone that
# update() moves depends on this flag; without it a `flip` applied after `text`
# (as happens when both are set from a scene) left the stretch on the wrong side.
@export var flip: bool = false:
	set(value):
		if !is_node_ready(): await ready
		flip = value
		model.rotation_degrees.x = -90 if value else 90
		update()

# Text width the unstretched bubble can hold. The 0.2 factor presumably mirrors
# the 0.2 scale of the chat_bubble instance in chat.tscn (TODO confirm).
const base_width = 0.8 * 0.2

# Stretches the bubble so it fits the current label text.
# Bone 1 is moved when flipped, bone 0 otherwise; the other bone is put back to
# the origin, which is the position this function writes whenever the text fits.
func update():
	var text_width = FontTools.get_font_size(label).x
	# Overflow beyond the base width, converted back by the same 0.2 factor.
	var offset = max(0.0, (text_width - base_width) / 0.2)

	var active_bone := 1 if flip else 0
	var idle_bone := 0 if flip else 1

	chat.set_bone_pose_position(idle_bone, Vector3.ZERO)
	chat.set_bone_pose_position(active_bone, Vector3(0, -offset if flip else offset, 0))

View File

@ -0,0 +1,33 @@
[gd_scene load_steps=5 format=3 uid="uid://cy6jklyde3pgo"]
[ext_resource type="PackedScene" uid="uid://b12raorbby1xd" path="res://assets/models/chat_bubble/chat_bubble.glb" id="1_lsdcs"]
[ext_resource type="Script" path="res://content/system/assist/chat.gd" id="1_rbrak"]
[ext_resource type="Material" uid="uid://bujy3egn1oqac" path="res://assets/materials/pri-500.material" id="2_ps3pl"]
[ext_resource type="FontVariation" uid="uid://d2ofyimg5s65q" path="res://assets/fonts/ui_font_500.tres" id="4_gxfp3"]
[node name="Chat" type="Node3D"]
transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, 3.41237e-06, 0, 0)
script = ExtResource("1_rbrak")
text = "Hello World"
flip = true
[node name="chat_bubble" parent="." instance=ExtResource("1_lsdcs")]
transform = Transform3D(0.2, 0, 0, 0, 0.2, 0, 0, 0, 0.2, -0.0154175, 0, 0.0710473)
[node name="Armature" parent="chat_bubble" index="0"]
transform = Transform3D(1, 0, 0, 0, 0, 1, 0, -1, 0, 0.5, 0, 0)
[node name="Cube" parent="chat_bubble/Armature/Skeleton3D" index="0"]
transform = Transform3D(-4.37114e-08, -1, -4.37114e-08, 0, -4.37114e-08, 1, -1, 4.37114e-08, 1.91069e-15, 0, 0.35, 0)
material_override = ExtResource("2_ps3pl")
[node name="Label3D" type="Label3D" parent="."]
transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0.006)
pixel_size = 0.001
text = "Hello World"
font = ExtResource("4_gxfp3")
font_size = 20
outline_size = 0
horizontal_alignment = 0
[editable path="chat_bubble"]

View File

@ -0,0 +1,42 @@
# Loading indicator: a ring of small dots whose alpha is animated in _process.
# Runs in the editor as well (@tool).
@tool
extends Node3D
# Base material; every dot gets its own duplicate in generate_meshes().
const material: StandardMaterial3D = preload ("res://assets/materials/pri-500.material")
# Seconds accumulated in _process while the node is visible; animation phase.
var time: float = 0.0
# Number of dots placed on the ring.
const DOT_COUNT = 8
# Radius of the ring the dots are placed on.
const RADIUS = 0.025
func _ready():
	generate_meshes()

# Adds DOT_COUNT small cylinder dots as children, evenly spaced on a circle of
# radius RADIUS in the local XY plane.
func generate_meshes():
	# All dots have the same geometry, so one mesh resource is shared instead of
	# allocating an identical CylinderMesh per dot.
	var dot_shape := CylinderMesh.new()
	dot_shape.top_radius = 0.005
	dot_shape.bottom_radius = 0.005
	dot_shape.height = 0.005

	for i in range(DOT_COUNT):
		var dot := MeshInstance3D.new()
		dot.mesh = dot_shape

		# Each dot needs its own material because _process fades them separately.
		var dot_material: StandardMaterial3D = material.duplicate()
		dot_material.transparency = BaseMaterial3D.TRANSPARENCY_ALPHA
		dot.material_override = dot_material

		add_child(dot)

		var angle = i * PI / DOT_COUNT * 2
		dot.position = Vector3(sin(angle), cos(angle), 0) * RADIUS
		dot.rotation_degrees = Vector3(90, 0, 0)
# Advances the animation clock and fades each child with a saw-tooth alpha that
# is phase-shifted by the child's index. Does nothing while the node is hidden.
func _process(delta):
	if not visible:
		return

	time += delta

	var dot_total := get_child_count()
	for index in dot_total:
		var dot := get_child(index)
		if dot == null:
			return
		var phase := index / float(dot_total) + time
		dot.material_override.albedo_color.a = saw_tooth(phase)
# Falling saw-tooth wave with period 1: returns 1 minus the fmod remainder of x.
func saw_tooth(x: float) -> float:
	var remainder := fmod(x, 1)
	return 1 - remainder

View File

@ -0,0 +1,6 @@
[gd_scene load_steps=2 format=3 uid="uid://b0d1582vpkr8m"]
[ext_resource type="Script" path="res://content/system/assist/loader.gd" id="1_3bi3s"]
[node name="Loader" type="Node3D"]
script = ExtResource("1_3bi3s")

View File

@ -1,5 +1,7 @@
extends RefCounted
const FontTools = preload ("res://lib/utils/font_tools.gd")
# Label whose font settings are used to measure `text`.
var label: Label3D
# String whose character gaps are measured by _calculate_text_gaps().
var text: String = ""
@ -73,14 +75,13 @@ func _calculate_caret_position(click_pos_x: float):
return gap_offsets.size() - 1
# Returns the horizontal offset of every gap between characters of `text`:
# index 0 is the start (0.0) and index i is the measured width of the first i
# characters, as reported by FontTools.get_font_size for `label`.
func _calculate_text_gaps():
	var offsets = [0.0]

	for i in range(text.length()):
		var chars = text.substr(0, i + 1) # Can't use single chars because of kerning.
		# FontTools already multiplies by label.pixel_size, so the width is
		# appended as-is and exactly once per prefix.
		var size = FontTools.get_font_size(label, chars)
		offsets.append(size.x)

	return offsets

View File

@ -1,8 +1,36 @@
const HASS_API = preload ("../hass.gd")
# Emitted by the setters of wake_word / stt_message / tts_message / tts_sound
# when they are assigned a new, non-null value.
signal on_wake_word(wake_word: String)
signal on_stt_message(message: String)
signal on_tts_message(message: String)
signal on_tts_sound(sound: AudioStreamMP3)
# Connection object used to send packets.
var api: HASS_API
# True between the "run-start" and "run-end" pipeline events.
var pipe_running := false
# Binary handler id taken from the "run-start" event; reset to 0 on "run-end".
var handler_id := 0
# Wake-word phrase of the current run (null when none).
# Assigning a non-null value that differs from the stored one emits
# `on_wake_word` before the new value is stored.
var wake_word = null:
	set(value):
		var is_new_phrase = value != null and value != wake_word
		if is_new_phrase:
			on_wake_word.emit(value)
		wake_word = value
# Last speech-to-text result.
# Assigning a non-null value that differs from the stored one emits
# `on_stt_message` before the new value is stored.
var stt_message = null:
	set(value):
		var is_new_message = value != null and value != stt_message
		if is_new_message:
			on_stt_message.emit(value)
		stt_message = value
# Last spoken-response text.
# Assigning a non-null value that differs from the stored one emits
# `on_tts_message` before the new value is stored.
var tts_message = null:
	set(value):
		var is_new_message = value != null and value != tts_message
		if is_new_message:
			on_tts_message.emit(value)
		tts_message = value
# Last downloaded text-to-speech audio stream.
# Assigning a non-null value that differs from the stored one emits
# `on_tts_sound` before the new value is stored.
var tts_sound = null:
	set(value):
		var is_new_sound = value != null and value != tts_sound
		if is_new_sound:
			on_tts_sound.emit(value)
		tts_sound = value
func _init(hass: HASS_API):
self.api = hass
@ -19,7 +47,7 @@ func start_wakeword():
api.send_packet({
"type": "assist_pipeline/run",
"start_stage": "wake_word",
"end_stage": "intent",
"end_stage": "tts",
"input": {
"timeout": 5,
"sample_rate": 16000
@ -50,21 +78,59 @@ func handle_message(message: Dictionary):
if event.has("type") == false:
return
print(event["type"])
print(message)
match event["type"]:
"run-start":
print("Pipeline started")
pipe_running = true
handler_id = event["data"]["runner_data"]["stt_binary_handler_id"]
"wake_word-end":
if pipe_running == false:
return
if event["data"]["wake_word_output"].has("wake_word_phrase") == false:
return
wake_word = event["data"]["wake_word_output"]["wake_word_phrase"]
"stt-end":
if pipe_running == false:
return
if event["data"]["stt_output"].has("text") == false:
return
stt_message = event["data"]["stt_output"]["text"]
"intent-end":
if pipe_running == false:
return
tts_message = event["data"]["intent_output"]["response"]["speech"]["plain"]["speech"]
"tts-end":
if pipe_running == false:
return
if event["data"]["tts_output"].has("url") == false:
return
var headers = PackedStringArray(["Authorization: Bearer %s" % api.token, "Content-Type: application/json"])
var url = "%s://%s%s" % ["https" if api.url.begins_with("wss") else "http", api.url.split("//")[1],event["data"]["tts_output"]["url"]]
Request.request(url, headers, HTTPClient.METHOD_GET)
var response = await Request.request_completed
if response[0] != HTTPRequest.RESULT_SUCCESS:
return
var sound = AudioStreamMP3.new()
sound.data = response[3]
tts_sound = sound
"run-end":
pipe_running = false
wake_word = null
handler_id = 0
"wake_word-start":
# handle trigger message
pass
"wake_word-end":
# handle trigger message
pass
_:
pass

9
lib/utils/font_tools.gd Normal file
View File

@ -0,0 +1,9 @@
# Measures a string with the label's font settings and scales the result by
# label.pixel_size.
# label: Label3D supplying font, font_size, horizontal_alignment, width and pixel_size.
# chars: string to measure; when null the label's own text is measured.
# Returns Vector2(0, 0) when the label has no font assigned.
static func get_font_size(label: Label3D, chars=null):
	var font = label.font

	if font == null:
		return Vector2(0, 0)

	var measured = chars
	if measured == null:
		measured = label.text

	var pixel_extent = font.get_string_size(measured, label.horizontal_alignment, label.width, label.font_size)
	return pixel_extent * label.pixel_size