Hamed744 commited on
Commit
1e6b325
·
verified ·
1 Parent(s): c580b6f

Update src/lib/multimodal-live-client.ts

Browse files
Files changed (1) hide show
  1. src/lib/multimodal-live-client.ts +155 -162
src/lib/multimodal-live-client.ts CHANGED
@@ -14,7 +14,7 @@
14
  * limitations under the License.
15
  */
16
 
17
- import { Content, GenerativeContentBlob, Part, InlineDataPart } from "@google/generative-ai";
18
  import { EventEmitter } from "eventemitter3";
19
  import { difference } from "lodash";
20
  import {
@@ -39,37 +39,12 @@ import {
39
  } from "../multimodal-live-types";
40
  import { blobToJSON, base64ToArrayBuffer } from "./utils";
41
 
42
- function arrayBufferToBase64(buffer: ArrayBuffer): string {
43
- let binary = '';
44
- const bytes = new Uint8Array(buffer);
45
- const len = bytes.byteLength;
46
- for (let i = 0; i < len; i++) {
47
- binary += String.fromCharCode(bytes[i]);
48
- }
49
- if (typeof btoa === 'function') {
50
- return btoa(binary);
51
- } else if (typeof Buffer !== 'undefined') {
52
- return Buffer.from(buffer).toString('base64');
53
- } else {
54
- throw new Error("Cannot convert ArrayBuffer to Base64 in this environment.");
55
- }
56
- }
57
-
58
- function isInlineDataPart(part: Part): part is InlineDataPart {
59
- return (
60
- typeof part === 'object' &&
61
- part !== null &&
62
- 'inlineData' in part &&
63
- typeof (part as any).inlineData === 'object' &&
64
- (part as any).inlineData !== null &&
65
- typeof (part as any).inlineData.mimeType === 'string' &&
66
- typeof (part as any).inlineData.data === 'string'
67
- );
68
- }
69
-
70
  interface MultimodalLiveClientEventTypes {
71
  open: () => void;
72
- log: (log: StreamingLog) => void; // این هنوز وجود دارد تا side-panel کار کند
73
  close: (event: CloseEvent) => void;
74
  audio: (data: ArrayBuffer) => void;
75
  content: (data: ServerContent) => void;
@@ -85,95 +60,78 @@ export type MultimodalLiveAPIClientConnection = {
85
  apiKey?: string;
86
  };
87
 
 
 
 
 
 
88
  export class MultimodalLiveClient extends EventEmitter<MultimodalLiveClientEventTypes> {
89
  public ws: WebSocket | null = null;
90
  protected config: LiveConfig | null = null;
91
  public url: string;
92
- private readonly AUDIO_SAMPLE_RATE = 16000;
93
- private readonly AUDIO_MIME_TYPE_BASE = `audio/l16`;
94
- private readonly AUDIO_MIME_TYPE_WITH_RATE = `${this.AUDIO_MIME_TYPE_BASE};rate=${this.AUDIO_SAMPLE_RATE}`;
95
-
96
- // --- 👇 لاگر داخلی برای ارسال به پنل لاگ، بدون console.log زیاد 👇 ---
97
- private logger: ((type: string, message: StreamingLog["message"], count?: number) => void) | null = null;
98
 
99
  constructor({ url, apiKey }: MultimodalLiveAPIClientConnection = {}) {
100
  super();
101
- console.log('🔧 Initializing MultimodalLiveClient...'); // فقط لاگ اولیه
102
  this.url = url || `${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/ws`;
103
- // اتصال logger داخلی به emit کردن رویداد log
104
- this.logger = (type, message, count) => {
105
- const logEntry: StreamingLog = { date: new Date(), type, message, count };
106
- this.emit("log", logEntry);
107
- };
108
  }
109
 
110
- // تابع log عمومی حذف شد، از logger داخلی استفاده می‌شود
 
 
 
 
 
 
 
111
 
112
  connect(config: LiveConfig): Promise<boolean> {
113
- this.logger?.("info", "Attempting WebSocket connection...", undefined);
114
  this.config = config;
115
-
116
- if (this.ws) {
117
- this.disconnect();
118
- }
119
 
120
  const ws = new WebSocket(this.url);
121
- ws.binaryType = "arraybuffer";
122
 
123
  ws.addEventListener("message", async (evt: MessageEvent) => {
124
- if (evt.data instanceof ArrayBuffer || evt.data instanceof Blob) {
125
- try {
126
- const messageText = await (evt.data instanceof Blob ? evt.data.text() : new TextDecoder().decode(evt.data));
127
- const response: LiveIncomingMessage = JSON.parse(messageText);
128
- this.receiveParsed(response);
129
- } catch (e) {
130
- console.error("Error parsing received binary message:", e);
131
- this.logger?.("error", "Error parsing received binary message", undefined);
132
- }
133
  } else {
134
- try {
135
- const response: LiveIncomingMessage = JSON.parse(evt.data);
136
- this.receiveParsed(response);
137
- } catch(e) {
138
- console.error("Error parsing received text message:", e, evt.data);
139
- this.logger?.("error", `Error parsing text message: ${evt.data ? evt.data.substring(0, 100) + '...' : 'empty'}`, undefined);
140
- }
141
  }
142
  });
143
-
144
  return new Promise((resolve, reject) => {
145
  const onError = (ev: Event) => {
 
146
  const message = `Could not connect to "${this.url}"`;
147
- console.error("WebSocket connection error:", message, ev);
148
- this.logger?.(`error.connect`, message, undefined);
149
  reject(new Error(message));
150
  };
151
  ws.addEventListener("error", onError);
152
-
153
  ws.addEventListener("open", (ev: Event) => {
154
- this.logger?.(`client.${ev.type}`, `connected to socket`, undefined);
 
 
 
 
 
 
155
  this.emit("open");
156
 
157
  this.ws = ws;
158
 
159
- if (!this.config) { // این چک باید بماند
160
- console.error("❌ Config not set when WebSocket opened!");
161
- this.logger?.("error", "Config not set when WebSocket opened!", undefined);
162
- reject("Invalid config state during WebSocket open");
163
- return;
164
- }
165
-
166
  const setupMessage: SetupMessage = {
167
  setup: this.config,
168
  };
169
  this._sendDirect(setupMessage);
170
- this.logger?.("client.send.setup", setupMessage, undefined); // ارسال به پنل لاگ
171
-
172
- // --- پیام‌های تحریک کننده و صدای سکوت حذف شده‌اند ---
173
 
174
  ws.removeEventListener("error", onError);
175
-
176
  ws.addEventListener("close", (ev: CloseEvent) => {
 
 
177
  let reason = ev.reason || "";
178
  if (reason.toLowerCase().includes("error")) {
179
  const prelude = "ERROR]";
@@ -185,12 +143,11 @@ export class MultimodalLiveClient extends EventEmitter<MultimodalLiveClientEvent
185
  );
186
  }
187
  }
188
- this.logger?.(
189
- `server.close`,
190
- `disconnected ${reason ? `with reason: ${reason}` : `(code: ${ev.code})`}`,
191
- ev.code
192
  );
193
- this.disconnect(ws);
194
  this.emit("close", ev);
195
  });
196
  resolve(true);
@@ -199,137 +156,173 @@ export class MultimodalLiveClient extends EventEmitter<MultimodalLiveClientEvent
199
  }
200
 
201
  disconnect(ws?: WebSocket) {
202
- const wsToClose = ws || this.ws;
203
- if (wsToClose && wsToClose.readyState !== WebSocket.CLOSED && wsToClose.readyState !== WebSocket.CLOSING) {
204
- this.logger?.("info", `Closing WebSocket connection (readyState: ${wsToClose.readyState})`, undefined);
205
- wsToClose.close();
206
- if (this.ws === wsToClose) {
207
- this.ws = null;
208
- }
 
209
  return true;
210
  }
 
211
  return false;
212
  }
213
 
214
- private createSilentAudioChunk(durationMs: number): Part {
215
- // این تابع دیگر استفاده نمی‌شود اما برای کامل بودن نگه داشته شده
216
- const bytesPerSample = 2;
217
- const numberOfSamples = Math.floor(this.AUDIO_SAMPLE_RATE * (durationMs / 1000));
218
- const bufferSize = numberOfSamples * bytesPerSample;
219
- const buffer = new ArrayBuffer(bufferSize);
220
- const base64Data = arrayBufferToBase64(buffer);
221
- return {
222
- inlineData: {
223
- mimeType: this.AUDIO_MIME_TYPE_WITH_RATE,
224
- data: base64Data,
225
- }
226
- };
227
- }
228
-
229
- protected receiveParsed(response: LiveIncomingMessage) {
230
- // --- 👇 لاگ دریافت پیام حذف شد 👇 ---
231
- // this.logger?.("server.receive", response);
232
-
233
  if (isToolCallMessage(response)) {
234
- this.logger?.("server.toolCall", response, undefined);
 
235
  this.emit("toolcall", response.toolCall);
236
  return;
237
  }
238
  if (isToolCallCancellationMessage(response)) {
239
- this.logger?.("server.toolCallCancellation", response, undefined);
 
240
  this.emit("toolcallcancellation", response.toolCallCancellation);
241
  return;
242
  }
 
243
  if (isSetupCompleteMessage(response)) {
244
- this.logger?.("server.setupComplete", response, undefined);
 
245
  this.emit("setupcomplete");
246
  return;
247
  }
 
 
 
248
  if (isServerContentMessage(response)) {
249
  const { serverContent } = response;
250
  if (isInterrupted(serverContent)) {
251
- this.logger?.("server.interrupted", response, undefined);
252
  this.emit("interrupted");
253
  return;
254
  }
255
  if (isTurnComplete(serverContent)) {
256
- this.logger?.("server.turnComplete", response, undefined);
257
  this.emit("turncomplete");
 
258
  }
 
259
  if (isModelTurn(serverContent)) {
260
  let parts: Part[] = serverContent.modelTurn.parts;
261
- const audioParts = parts.filter(isInlineDataPart)
262
- .filter(part => part.inlineData.mimeType.startsWith(this.AUDIO_MIME_TYPE_BASE));
 
 
 
 
 
 
263
  const otherParts = difference(parts, audioParts);
264
- let audioByteLength = 0;
265
- audioParts.forEach((part) => {
266
- if (part.inlineData.data) {
267
- try {
268
- const data = base64ToArrayBuffer(part.inlineData.data);
269
- this.emit("audio", data); // رویداد صدا هنوز emit می‌شود
270
- audioByteLength += data.byteLength;
271
- } catch (e) {
272
- console.error("Error decoding base64 audio:", e);
273
- this.logger?.("error", "Error decoding base64 audio", undefined);
274
- }
275
  }
276
  });
277
- // --- 👇 لاگ بایت صدا حذف شد 👇 ---
278
- // if (audioByteLength > 0) {
279
- // this.logger?.(`server.audio`, `buffer`, audioByteLength);
280
- // }
281
- if (!otherParts.length) return;
282
 
283
  parts = otherParts;
284
- const modelTurnContent: ModelTurn = { modelTurn: { parts } };
285
- this.emit("content", modelTurnContent);
286
- // --- 👇 لاگ محتوای غیرصوتی حذف شد 👇 ---
287
- // this.logger?.(`server.content`, response);
288
  }
289
  } else {
290
- // لاگ پیام ناشناخته را نگه می‌داریم
291
- console.log("Received unrecognized message structure:", response);
292
- this.logger?.("server.unknown", response, undefined);
293
  }
294
  }
295
 
 
 
 
296
  sendRealtimeInput(chunks: GenerativeContentBlob[]) {
297
- // --- 👇 لاگ ارسال صدا حذف شد 👇 ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  const data: RealtimeInputMessage = {
299
- realtimeInput: { mediaChunks: chunks },
 
 
300
  };
301
  this._sendDirect(data);
302
- // this.logger?.(`client.realtimeInput`, `audio/video chunks`, chunks.length);
303
  }
304
 
 
 
 
305
  sendToolResponse(toolResponse: ToolResponseMessage["toolResponse"]) {
306
- const message: ToolResponseMessage = { toolResponse };
 
 
 
307
  this._sendDirect(message);
308
- this.logger?.(`client.toolResponse`, message, undefined); // این لاگ معمولا کم تکرار است
309
  }
310
 
 
 
 
311
  send(parts: Part | Part[], turnComplete: boolean = true) {
312
  parts = Array.isArray(parts) ? parts : [parts];
313
- const content: Content = { role: "user", parts };
 
 
 
 
314
  const clientContentRequest: ClientContentMessage = {
315
- clientContent: { turns: [content], turnComplete },
 
 
 
316
  };
 
317
  this._sendDirect(clientContentRequest);
318
- this.logger?.(`client.send.content`, clientContentRequest, undefined); // این لاگ هم معمولا کم تکرار است
319
  }
320
 
 
 
 
 
321
  _sendDirect(request: object) {
322
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
323
- console.error("WebSocket is not connected or not open. Cannot send message:", request);
324
- this.logger?.("error", "Attempted to send message while WebSocket not open", undefined);
325
- return;
326
- }
327
- try {
328
- const str = JSON.stringify(request);
329
- this.ws.send(str);
330
- } catch (error) {
331
- console.error("Error stringifying or sending message:", error, request);
332
- this.logger?.("error", `Error sending message: ${error}`, undefined);
333
  }
 
 
334
  }
335
- }
 
14
  * limitations under the License.
15
  */
16
 
17
+ import { Content, GenerativeContentBlob, Part } from "@google/generative-ai";
18
  import { EventEmitter } from "eventemitter3";
19
  import { difference } from "lodash";
20
  import {
 
39
  } from "../multimodal-live-types";
40
  import { blobToJSON, base64ToArrayBuffer } from "./utils";
41
 
42
+ /**
43
+ * the events that this client will emit
44
+ */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  interface MultimodalLiveClientEventTypes {
46
  open: () => void;
47
+ log: (log: StreamingLog) => void;
48
  close: (event: CloseEvent) => void;
49
  audio: (data: ArrayBuffer) => void;
50
  content: (data: ServerContent) => void;
 
60
  apiKey?: string;
61
  };
62
 
63
+ /**
64
+ * A event-emitting class that manages the connection to the websocket and emits
65
+ * events to the rest of the application.
66
+ * If you dont want to use react you can still use this.
67
+ */
68
  export class MultimodalLiveClient extends EventEmitter<MultimodalLiveClientEventTypes> {
69
  public ws: WebSocket | null = null;
70
  protected config: LiveConfig | null = null;
71
  public url: string;
 
 
 
 
 
 
72
 
73
  constructor({ url, apiKey }: MultimodalLiveAPIClientConnection = {}) {
74
  super();
75
+ console.log('🔧 Initializing MultimodalLiveClient with URL:', url || `${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/ws`);
76
  this.url = url || `${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/ws`;
77
+ this.send = this.send.bind(this);
 
 
 
 
78
  }
79
 
80
+ log(type: string, message: StreamingLog["message"]) {
81
+ const log: StreamingLog = {
82
+ date: new Date(),
83
+ type,
84
+ message,
85
+ };
86
+ this.emit("log", log);
87
+ }
88
 
89
  connect(config: LiveConfig): Promise<boolean> {
90
+ console.log('🔌 Attempting WebSocket connection to:', this.url);
91
  this.config = config;
92
+ console.log('🔗 MultimodalLiveClient: Starting WebSocket connection to:', this.url);
 
 
 
93
 
94
  const ws = new WebSocket(this.url);
 
95
 
96
  ws.addEventListener("message", async (evt: MessageEvent) => {
97
+ console.log('📨 Received WebSocket message:', evt.data instanceof Blob ? 'Blob data' : evt.data);
98
+ if (evt.data instanceof Blob) {
99
+ console.log('📩 MultimodalLiveClient: Received blob message');
100
+ this.receive(evt.data);
 
 
 
 
 
101
  } else {
102
+ console.log("non blob message", evt);
 
 
 
 
 
 
103
  }
104
  });
 
105
  return new Promise((resolve, reject) => {
106
  const onError = (ev: Event) => {
107
+ this.disconnect(ws);
108
  const message = `Could not connect to "${this.url}"`;
109
+ this.log(`server.${ev.type}`, message);
 
110
  reject(new Error(message));
111
  };
112
  ws.addEventListener("error", onError);
 
113
  ws.addEventListener("open", (ev: Event) => {
114
+ console.log('✅ WebSocket connection opened successfully');
115
+ if (!this.config) {
116
+ reject("Invalid config sent to `connect(config)`");
117
+ return;
118
+ }
119
+ console.log('✨ MultimodalLiveClient: WebSocket connection established');
120
+ this.log(`client.${ev.type}`, `connected to socket`);
121
  this.emit("open");
122
 
123
  this.ws = ws;
124
 
 
 
 
 
 
 
 
125
  const setupMessage: SetupMessage = {
126
  setup: this.config,
127
  };
128
  this._sendDirect(setupMessage);
129
+ this.log("client.send", "setup");
 
 
130
 
131
  ws.removeEventListener("error", onError);
 
132
  ws.addEventListener("close", (ev: CloseEvent) => {
133
+ console.log(ev);
134
+ this.disconnect(ws);
135
  let reason = ev.reason || "";
136
  if (reason.toLowerCase().includes("error")) {
137
  const prelude = "ERROR]";
 
143
  );
144
  }
145
  }
146
+ console.log('📝 Close reason:', reason || 'No reason provided');
147
+ this.log(
148
+ `server.${ev.type}`,
149
+ `disconnected ${reason ? `with reason: ${reason}` : ``}`,
150
  );
 
151
  this.emit("close", ev);
152
  });
153
  resolve(true);
 
156
  }
157
 
158
  disconnect(ws?: WebSocket) {
159
+ console.log('🔌 Attempting to disconnect WebSocket');
160
+ // could be that this is an old websocket and theres already a new instance
161
+ // only close it if its still the correct reference
162
+ if ((!ws || this.ws === ws) && this.ws) {
163
+ console.log('🔒 Closing WebSocket connection');
164
+ this.ws.close();
165
+ this.ws = null;
166
+ this.log("client.close", `Disconnected`);
167
  return true;
168
  }
169
+ console.log('⚠️ No active WebSocket to disconnect');
170
  return false;
171
  }
172
 
173
+ protected async receive(blob: Blob) {
174
+ const response: LiveIncomingMessage = (await blobToJSON(
175
+ blob,
176
+ )) as LiveIncomingMessage;
177
+ console.log('📥 Received message:', response);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  if (isToolCallMessage(response)) {
179
+ console.log('🛠️ MultimodalLiveClient: Received tool call');
180
+ this.log("server.toolCall", response);
181
  this.emit("toolcall", response.toolCall);
182
  return;
183
  }
184
  if (isToolCallCancellationMessage(response)) {
185
+ console.log('🚫 MultimodalLiveClient: Received tool call cancellation');
186
+ this.log("receive.toolCallCancellation", response);
187
  this.emit("toolcallcancellation", response.toolCallCancellation);
188
  return;
189
  }
190
+
191
  if (isSetupCompleteMessage(response)) {
192
+ console.log('🎉 MultimodalLiveClient: Setup complete received');
193
+ this.log("server.send", "setupComplete");
194
  this.emit("setupcomplete");
195
  return;
196
  }
197
+
198
+ // this json also might be `contentUpdate { interrupted: true }`
199
+ // or contentUpdate { end_of_turn: true }
200
  if (isServerContentMessage(response)) {
201
  const { serverContent } = response;
202
  if (isInterrupted(serverContent)) {
203
+ this.log("receive.serverContent", "interrupted");
204
  this.emit("interrupted");
205
  return;
206
  }
207
  if (isTurnComplete(serverContent)) {
208
+ this.log("server.send", "turnComplete");
209
  this.emit("turncomplete");
210
+ //plausible theres more to the message, continue
211
  }
212
+
213
  if (isModelTurn(serverContent)) {
214
  let parts: Part[] = serverContent.modelTurn.parts;
215
+
216
+ // when its audio that is returned for modelTurn
217
+ const audioParts = parts.filter(
218
+ (p) => p.inlineData && p.inlineData.mimeType.startsWith("audio/pcm"),
219
+ );
220
+ const base64s = audioParts.map((p) => p.inlineData?.data);
221
+
222
+ // strip the audio parts out of the modelTurn
223
  const otherParts = difference(parts, audioParts);
224
+ // console.log("otherParts", otherParts);
225
+
226
+ base64s.forEach((b64) => {
227
+ if (b64) {
228
+ const data = base64ToArrayBuffer(b64);
229
+ this.emit("audio", data);
230
+ this.log(`server.audio`, `buffer (${data.byteLength})`);
 
 
 
 
231
  }
232
  });
233
+ if (!otherParts.length) {
234
+ return;
235
+ }
 
 
236
 
237
  parts = otherParts;
238
+
239
+ const content: ModelTurn = { modelTurn: { parts } };
240
+ this.emit("content", content);
241
+ this.log(`server.content`, response);
242
  }
243
  } else {
244
+ console.log("received unmatched message", response);
 
 
245
  }
246
  }
247
 
248
+ /**
249
+ * send realtimeInput, this is base64 chunks of "audio/pcm" and/or "image/jpg"
250
+ */
251
  sendRealtimeInput(chunks: GenerativeContentBlob[]) {
252
+ let hasAudio = false;
253
+ let hasVideo = false;
254
+ for (let i = 0; i < chunks.length; i++) {
255
+ const ch = chunks[i];
256
+ if (ch.mimeType.includes("audio")) {
257
+ hasAudio = true;
258
+ }
259
+ if (ch.mimeType.includes("image")) {
260
+ hasVideo = true;
261
+ }
262
+ if (hasAudio && hasVideo) {
263
+ break;
264
+ }
265
+ }
266
+ const message =
267
+ hasAudio && hasVideo
268
+ ? "audio + video"
269
+ : hasAudio
270
+ ? "audio"
271
+ : hasVideo
272
+ ? "video"
273
+ : "unknown";
274
+
275
  const data: RealtimeInputMessage = {
276
+ realtimeInput: {
277
+ mediaChunks: chunks,
278
+ },
279
  };
280
  this._sendDirect(data);
281
+ this.log(`client.realtimeInput`, message);
282
  }
283
 
284
+ /**
285
+ * send a response to a function call and provide the id of the functions you are responding to
286
+ */
287
  sendToolResponse(toolResponse: ToolResponseMessage["toolResponse"]) {
288
+ const message: ToolResponseMessage = {
289
+ toolResponse,
290
+ };
291
+
292
  this._sendDirect(message);
293
+ this.log(`client.toolResponse`, message);
294
  }
295
 
296
+ /**
297
+ * send normal content parts such as { text }
298
+ */
299
  send(parts: Part | Part[], turnComplete: boolean = true) {
300
  parts = Array.isArray(parts) ? parts : [parts];
301
+ const content: Content = {
302
+ role: "user",
303
+ parts,
304
+ };
305
+
306
  const clientContentRequest: ClientContentMessage = {
307
+ clientContent: {
308
+ turns: [content],
309
+ turnComplete,
310
+ },
311
  };
312
+
313
  this._sendDirect(clientContentRequest);
314
+ this.log(`client.send`, clientContentRequest);
315
  }
316
 
317
+ /**
318
+ * used internally to send all messages
319
+ * don't use directly unless trying to send an unsupported message type
320
+ */
321
  _sendDirect(request: object) {
322
+ if (!this.ws) {
323
+ throw new Error("WebSocket is not connected");
 
 
 
 
 
 
 
 
 
324
  }
325
+ const str = JSON.stringify(request);
326
+ this.ws.send(str);
327
  }
328
+ }