diff --git a/assets/chat_bubble.blend b/assets/chat_bubble.blend new file mode 100644 index 0000000..f944d04 --- /dev/null +++ b/assets/chat_bubble.blend @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5fa5f006a42d87f43a8b411e4e4bf64a8b4fbbaedd0d02579134a8fa59161eb +size 894176 diff --git a/assets/chat_bubble.blend1 b/assets/chat_bubble.blend1 new file mode 100644 index 0000000..6ee4542 Binary files /dev/null and b/assets/chat_bubble.blend1 differ diff --git a/assets/models/chat_bubble/chat_bubble.glb b/assets/models/chat_bubble/chat_bubble.glb new file mode 100644 index 0000000..8cf0d61 --- /dev/null +++ b/assets/models/chat_bubble/chat_bubble.glb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f215158ae0aba0113e0077040342fc1b508cfec0a3a1e022c9ce0c16973e2ee1 +size 17828 diff --git a/assets/models/chat_bubble/chat_bubble.glb.import b/assets/models/chat_bubble/chat_bubble.glb.import new file mode 100644 index 0000000..f0166c8 --- /dev/null +++ b/assets/models/chat_bubble/chat_bubble.glb.import @@ -0,0 +1,34 @@ +[remap] + +importer="scene" +importer_version=1 +type="PackedScene" +uid="uid://b12raorbby1xd" +path="res://.godot/imported/chat_bubble.glb-03622c64b96f5698360bcfb8a4904483.scn" + +[deps] + +source_file="res://assets/models/chat_bubble/chat_bubble.glb" +dest_files=["res://.godot/imported/chat_bubble.glb-03622c64b96f5698360bcfb8a4904483.scn"] + +[params] + +nodes/root_type="" +nodes/root_name="" +nodes/apply_root_scale=true +nodes/root_scale=1.0 +meshes/ensure_tangents=true +meshes/generate_lods=true +meshes/create_shadow_meshes=true +meshes/light_baking=1 +meshes/lightmap_texel_size=0.2 +meshes/force_disable_compression=false +skins/use_named_skins=true +animation/import=true +animation/fps=30 +animation/trimming=false +animation/remove_immutable_tracks=true +import_script/path="" +_subresources={} +gltf/naming_version=1 +gltf/embedded_image_handling=1 diff --git a/content/main.tscn b/content/main.tscn index b852603..19643a5 100644 --- a/content/main.tscn +++ b/content/main.tscn @@ -85,5 +85,6 @@ transform = Transform3D(0.499999, -0.000139169, -6.50204e-05, 5.24307e-05, 0.353 [node name="House" parent="." instance=ExtResource("9_np6mw")] [node name="Assist" parent="." instance=ExtResource("12_8av8q")] +transform = Transform3D(1, -1.39636e-11, 0, 9.47986e-12, 1, 0, 0, 0, 1, 0.000231838, -4.01369e-06, -0.855612) [editable path="XROrigin3D/XRControllerLeft"] diff --git a/content/system/assist/assist.gd b/content/system/assist/assist.gd index e277c3d..81de3dc 100644 --- a/content/system/assist/assist.gd +++ b/content/system/assist/assist.gd @@ -1,24 +1,79 @@ extends Node3D const sample_hold = preload ("res://lib/utils/sample_hold.gd") +const Chat = preload ("./chat.gd") const audio_freq = 44100 const target_freq = 16000 const sample_rate_ratio: float = audio_freq / target_freq * 1.5 var effect: AudioEffectCapture -@export var input_threshold: float = 0.05 +@export var input_threshold: float = 0.1 @onready var audio_recorder: AudioStreamPlayer = $AudioStreamRecord -@onready var timer: Timer = $Timer +@onready var audio_timer: Timer = $AudioTimer +@onready var visual_timer: Timer = $VisualTimer +@onready var audio_player_3d: AudioStreamPlayer3D = $AudioStreamPlayer3D +@onready var chat_user: Chat = $ChatUser +@onready var chat_assistant: Chat = $ChatAssistant +@onready var loader: Node3D = $Loader +@onready var camera = $"/root/Main/XROrigin3D/XRCamera3D" + +var running := true func _ready(): var index = AudioServer.get_bus_index("Record") effect = AudioServer.get_bus_effect(index, 0) - timer.timeout.connect(func(): + finish() + + audio_timer.timeout.connect(func(): HomeApi.api.assist_handler.send_data(PackedByteArray()) ) + HomeApi.api.assist_handler.on_wake_word.connect(func(text): + loader.visible=true + chat_user.visible=false + chat_assistant.visible=false + global_position=camera.global_position + camera.global_transform.basis.z * - 0.5 + global_position.y *= 0.7 + global_transform.basis=Basis.looking_at((camera.global_position - global_position) * - 1) + running=true + ) + + HomeApi.api.assist_handler.on_stt_message.connect(func(text): + loader.visible=false + chat_user.visible=true + chat_user.text=text + ) + HomeApi.api.assist_handler.on_tts_message.connect(func(text): + chat_assistant.visible=true + chat_assistant.text=text + ) + + HomeApi.api.assist_handler.on_tts_sound.connect(func(audio): + print("Playing TTS ", audio.data.size()) + audio_player_3d.stream=audio + audio_player_3d.play() + visual_timer.start() + running=false + ) + + visual_timer.timeout.connect(func(): + if audio_player_3d.playing == false: + finish() + else: + await audio_player_3d.finished + finish() + ) + +func finish(): + if running: + return + + chat_user.visible = false + chat_assistant.visible = false + loader.visible = false + func _process(_delta): var sterioData: PackedVector2Array = effect.get_buffer(effect.get_frames_available()) @@ -41,10 +96,10 @@ func _process(_delta): data.encode_s16(i * 2, int(value * 32767)) if max_amplitude > input_threshold: - if timer.is_stopped(): + if audio_timer.is_stopped(): HomeApi.api.assist_handler.start_wakeword() - timer.start() + audio_timer.start() - if timer.is_stopped() == false: - HomeApi.api.assist_handler.send_data(data) \ No newline at end of file + if audio_timer.is_stopped() == false: + HomeApi.api.assist_handler.send_data(data) diff --git a/content/system/assist/assist.tscn b/content/system/assist/assist.tscn index 86b284b..1356257 100644 --- a/content/system/assist/assist.tscn +++ b/content/system/assist/assist.tscn @@ -1,6 +1,8 @@ -[gd_scene load_steps=3 format=3 uid="uid://oydbwnek6xb4"] +[gd_scene load_steps=5 format=3 uid="uid://oydbwnek6xb4"] [ext_resource type="Script" path="res://content/system/assist/assist.gd" id="1_5obhy"] +[ext_resource type="PackedScene" uid="uid://cy6jklyde3pgo" path="res://content/system/assist/chat.tscn" id="2_laew1"] +[ext_resource type="PackedScene" uid="uid://b0d1582vpkr8m" path="res://content/system/assist/loader.tscn" id="3_25iy1"] [sub_resource type="AudioStreamMicrophone" id="AudioStreamMicrophone_6tv2x"] @@ -12,6 +14,22 @@ stream = SubResource("AudioStreamMicrophone_6tv2x") autoplay = true bus = &"Record" -[node name="Timer" type="Timer" parent="."] +[node name="AudioTimer" type="Timer" parent="."] wait_time = 2.0 one_shot = true + +[node name="AudioStreamPlayer3D" type="AudioStreamPlayer3D" parent="."] + +[node name="VisualTimer" type="Timer" parent="."] +wait_time = 4.0 +one_shot = true + +[node name="ChatUser" parent="." instance=ExtResource("2_laew1")] +transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, -0.109997, 0.025, 0) +flip = false + +[node name="ChatAssistant" parent="." instance=ExtResource("2_laew1")] +transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, -0.0499932, -0.025, 0) +text = "Hello, World!" + +[node name="Loader" parent="." instance=ExtResource("3_25iy1")] diff --git a/content/system/assist/chat.gd b/content/system/assist/chat.gd new file mode 100644 index 0000000..2cd61a2 --- /dev/null +++ b/content/system/assist/chat.gd @@ -0,0 +1,37 @@ +@tool +extends Node3D + +const FontTools = preload ("res://lib/utils/font_tools.gd") + +@onready var label: Label3D = $Label3D +@onready var chat: Skeleton3D = $chat_bubble/Armature/Skeleton3D +@onready var model: MeshInstance3D = $chat_bubble/Armature/Skeleton3D/Cube + +@export var text := "Hello, World!": + set(value): + if !is_node_ready(): await ready + + text = value + label.text = value + update() + +@export var flip: bool = false: + set(value): + if !is_node_ready(): await ready + + flip = value + model.rotation_degrees.x = -90 if value else 90 + +const base_width = 0.8 * 0.2 + +func update(): + var text_width = FontTools.get_font_size(label).x + + var offset = (text_width - base_width) / 0.2 + + offset = max(0.0, offset) + + if flip: + offset = -offset + + chat.set_bone_pose_position(1 if flip else 0, Vector3(0, offset, 0)) \ No newline at end of file diff --git a/content/system/assist/chat.tscn b/content/system/assist/chat.tscn new file mode 100644 index 0000000..6d1136a --- /dev/null +++ b/content/system/assist/chat.tscn @@ -0,0 +1,33 @@ +[gd_scene load_steps=5 format=3 uid="uid://cy6jklyde3pgo"] + +[ext_resource type="PackedScene" uid="uid://b12raorbby1xd" path="res://assets/models/chat_bubble/chat_bubble.glb" id="1_lsdcs"] +[ext_resource type="Script" path="res://content/system/assist/chat.gd" id="1_rbrak"] +[ext_resource type="Material" uid="uid://bujy3egn1oqac" path="res://assets/materials/pri-500.material" id="2_ps3pl"] +[ext_resource type="FontVariation" uid="uid://d2ofyimg5s65q" path="res://assets/fonts/ui_font_500.tres" id="4_gxfp3"] + +[node name="Chat" type="Node3D"] +transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, 3.41237e-06, 0, 0) +script = ExtResource("1_rbrak") +text = "Hello World" +flip = true + +[node name="chat_bubble" parent="." instance=ExtResource("1_lsdcs")] +transform = Transform3D(0.2, 0, 0, 0, 0.2, 0, 0, 0, 0.2, -0.0154175, 0, 0.0710473) + +[node name="Armature" parent="chat_bubble" index="0"] +transform = Transform3D(1, 0, 0, 0, 0, 1, 0, -1, 0, 0.5, 0, 0) + +[node name="Cube" parent="chat_bubble/Armature/Skeleton3D" index="0"] +transform = Transform3D(-4.37114e-08, -1, -4.37114e-08, 0, -4.37114e-08, 1, -1, 4.37114e-08, 1.91069e-15, 0, 0.35, 0) +material_override = ExtResource("2_ps3pl") + +[node name="Label3D" type="Label3D" parent="."] +transform = Transform3D(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0.006) +pixel_size = 0.001 +text = "Hello World" +font = ExtResource("4_gxfp3") +font_size = 20 +outline_size = 0 +horizontal_alignment = 0 + +[editable path="chat_bubble"] diff --git a/content/system/assist/loader.gd b/content/system/assist/loader.gd new file mode 100644 index 0000000..838be69 --- /dev/null +++ b/content/system/assist/loader.gd @@ -0,0 +1,42 @@ +@tool +extends Node3D + +const material: StandardMaterial3D = preload ("res://assets/materials/pri-500.material") +var time: float = 0.0 +const DOT_COUNT = 8 +const RADIUS = 0.025 + +func _ready(): + generate_meshes() + +func generate_meshes(): + for i in range(DOT_COUNT): + var mesh := MeshInstance3D.new() + mesh.mesh = CylinderMesh.new() + mesh.mesh.top_radius = 0.005 + mesh.mesh.bottom_radius = 0.005 + mesh.mesh.height = 0.005 + mesh.material_override = material.duplicate() + mesh.material_override.transparency = BaseMaterial3D.TRANSPARENCY_ALPHA + + add_child(mesh) + + mesh.position = Vector3(sin(i * PI / DOT_COUNT * 2), cos(i * PI / DOT_COUNT * 2), 0) * RADIUS + mesh.rotation_degrees = Vector3(90, 0, 0) + +func _process(delta): + if !visible: + return + + time += delta + + for i in range(get_child_count()): + var mesh := get_child(i) + + if mesh == null: + return + + mesh.material_override.albedo_color.a = saw_tooth(i / float(get_child_count()) + time) + +func saw_tooth(x: float) -> float: + return 1 - fmod(x, 1) \ No newline at end of file diff --git a/content/system/assist/loader.tscn b/content/system/assist/loader.tscn new file mode 100644 index 0000000..2cc43de --- /dev/null +++ b/content/system/assist/loader.tscn @@ -0,0 +1,6 @@ +[gd_scene load_steps=2 format=3 uid="uid://b0d1582vpkr8m"] + +[ext_resource type="Script" path="res://content/system/assist/loader.gd" id="1_3bi3s"] + +[node name="Loader" type="Node3D"] +script = ExtResource("1_3bi3s") diff --git a/content/ui/components/input/text_handler.gd b/content/ui/components/input/text_handler.gd index 51bef8b..9679eac 100644 --- a/content/ui/components/input/text_handler.gd +++ b/content/ui/components/input/text_handler.gd @@ -1,5 +1,7 @@ extends RefCounted +const FontTools = preload ("res://lib/utils/font_tools.gd") + var label: Label3D var text: String = "" @@ -73,14 +75,13 @@ func _calculate_caret_position(click_pos_x: float): return gap_offsets.size() - 1 func _calculate_text_gaps(): - var font = label.get_font() var offsets = [0.0] for i in range(text.length()): var chars = text.substr(0, i + 1) # Can't use single chars because of kerning. - var size = font.get_string_size(chars, HORIZONTAL_ALIGNMENT_CENTER, -1, label.font_size) + var size = FontTools.get_font_size(label, chars) - offsets.append(size.x * label.pixel_size) + offsets.append(size.x) return offsets diff --git a/lib/home_apis/hass_ws/handlers/assist.gd b/lib/home_apis/hass_ws/handlers/assist.gd index d0601ee..a7fac54 100644 --- a/lib/home_apis/hass_ws/handlers/assist.gd +++ b/lib/home_apis/hass_ws/handlers/assist.gd @@ -1,8 +1,36 @@ const HASS_API = preload ("../hass.gd") +signal on_wake_word(wake_word: String) +signal on_stt_message(message: String) +signal on_tts_message(message: String) +signal on_tts_sound(sound: AudioStreamMP3) + var api: HASS_API var pipe_running := false var handler_id := 0 +var wake_word = null: + set(value): + if value != wake_word&&value != null: + on_wake_word.emit(value) + wake_word = value + +var stt_message = null: + set(value): + if value != stt_message&&value != null: + on_stt_message.emit(value) + stt_message = value + +var tts_message = null: + set(value): + if value != tts_message&&value != null: + on_tts_message.emit(value) + tts_message = value + +var tts_sound = null: + set(value): + if value != tts_sound&&value != null: + on_tts_sound.emit(value) + tts_sound = value func _init(hass: HASS_API): self.api = hass @@ -19,7 +47,7 @@ func start_wakeword(): api.send_packet({ "type": "assist_pipeline/run", "start_stage": "wake_word", - "end_stage": "intent", + "end_stage": "tts", "input": { "timeout": 5, "sample_rate": 16000 @@ -50,21 +78,59 @@ func handle_message(message: Dictionary): if event.has("type") == false: return - print(event["type"]) + print(message) match event["type"]: "run-start": print("Pipeline started") pipe_running = true handler_id = event["data"]["runner_data"]["stt_binary_handler_id"] + "wake_word-end": + if pipe_running == false: + return + + if event["data"]["wake_word_output"].has("wake_word_phrase") == false: + return + + wake_word = event["data"]["wake_word_output"]["wake_word_phrase"] + "stt-end": + if pipe_running == false: + return + + if event["data"]["stt_output"].has("text") == false: + return + + stt_message = event["data"]["stt_output"]["text"] + "intent-end": + if pipe_running == false: + return + + tts_message = event["data"]["intent_output"]["response"]["speech"]["plain"]["speech"] + "tts-end": + if pipe_running == false: + return + + if event["data"]["tts_output"].has("url") == false: + return + + var headers = PackedStringArray(["Authorization: Bearer %s" % api.token, "Content-Type: application/json"]) + var url = "%s://%s%s" % ["https" if api.url.begins_with("wss") else "http", api.url.split("//")[1],event["data"]["tts_output"]["url"]] + + Request.request(url, headers, HTTPClient.METHOD_GET) + + var response = await Request.request_completed + + if response[0] != HTTPRequest.RESULT_SUCCESS: + return + + var sound = AudioStreamMP3.new() + sound.data = response[3] + + tts_sound = sound + "run-end": pipe_running = false + wake_word = null handler_id = 0 - "wake_word-start": - # handle trigger message - pass - "wake_word-end": - # handle trigger message - pass _: pass diff --git a/lib/utils/font_tools.gd b/lib/utils/font_tools.gd new file mode 100644 index 0000000..a8677c6 --- /dev/null +++ b/lib/utils/font_tools.gd @@ -0,0 +1,9 @@ +static func get_font_size(label: Label3D, chars=null): + var font = label.font + + if font == null: + return Vector2(0, 0) + + var size = font.get_string_size(label.text if chars == null else chars, label.horizontal_alignment, label.width, label.font_size) * label.pixel_size + + return size \ No newline at end of file