diff --git a/base-config.yaml b/base-config.yaml
index 249b9bb..0966fbd 100644
--- a/base-config.yaml
+++ b/base-config.yaml
@@ -14,5 +14,8 @@ allowed_rooms: []
 # Optional text to guide the model's style or to continue from a previous audio segment. The prompt should match the language of the audio.
 prompt: "以下是普通话录制的会议记录:"
 
+# Optional: should OpenAI re-read the transcript to detect reminders and events?
+search_reminders_and_events: false
+
 # The language of the input audio. Providing the input language in ISO-639-1 format (Chinese: "zh") will improve accuracy and latency, leave empty by default
 language:
diff --git a/maubot.yaml b/maubot.yaml
index b8d0706..769d3f1 100644
--- a/maubot.yaml
+++ b/maubot.yaml
@@ -1,6 +1,6 @@
 maubot: 0.1.0
 id: nigzu.com.maubot-stt
-version: 0.2.6
+version: 0.3.0
 license: MIT
 modules:
 - openai-whisper
diff --git a/openai-whisper.py b/openai-whisper.py
index bc4aa36..9065f6f 100644
--- a/openai-whisper.py
+++ b/openai-whisper.py
@@ -19,6 +19,7 @@ class Config(BaseProxyConfig):
         helper.copy("allowed_users")
         helper.copy("allowed_rooms")
         helper.copy("prompt")
+        helper.copy("search_reminders_and_events")
         helper.copy("language")
 
 class WhisperPlugin(Plugin):
@@ -32,6 +33,7 @@
         self.language = self.config['language']
         self.allowed_users = self.config['allowed_users']
         self.allowed_rooms = self.config['allowed_rooms']
+        self.search_reminders_and_events = self.config['search_reminders_and_events']
         self.log.debug("Whisper plugin started")
 
     async def should_respond(self, event: MessageEvent) -> bool:
@@ -54,12 +56,16 @@
         try:
             await event.mark_read()
             await self.client.set_typing(event.room_id, timeout=99999)
+
+            self.log.debug(event)
 
             audio_bytes = await self.client.download_media(url=event.content.url)
             transcription = await self.transcribe_audio(audio_bytes)
+
+            if self.search_reminders_and_events:
+                transcription = await self.study_transcribe(transcription)
 
             await self.client.set_typing(event.room_id, timeout=0)
-
             content = TextMessageEventContent(
                 msgtype=MessageType.TEXT,
                 body=transcription,
@@ -104,6 +110,44 @@
             self.log.exception(f"Failed to transcribe audio, msg: {e}")
             return "Sorry, an error occurred while transcribing the audio."
 
+    async def study_transcribe(self, transcription: str) -> str:
+        prompt = f"""
+        Here is the transcription of the voice message:
+
+        {transcription}
+        Your goal is to analyse this transcription and determine whether the user is trying to create a reminder or an event.
+
+        - If the user is trying to create a reminder, the output must take the form:
+        !rappel
+        - If the user is trying to create an event, the output must take the form:
+        !agenda ##ROOM##
+        - If the user is trying to create neither a reminder nor an event, return only the transcription as-is, with no added explanation, extra text or superfluous punctuation.
+        Provide no other text or explanation in your answer, only the requested output.
+        """
+
+        url = "https://api.openai.com/v1/chat/completions"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+        data = {
+            "model": "gpt-4",
+            "messages": [
+                {"role": "user", "content": prompt}
+            ],
+            "temperature": 0.7
+        }
+
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url, headers=headers, json=data) as response:
+                response_json = await response.json()
+                # Fall back to the raw transcription if the API returned no choices.
+                choices = response_json.get('choices')
+                if not choices:
+                    return transcription
+                return choices[0].get('message', {}).get('content', transcription)
+
     @classmethod
     def get_config_class(cls) -> Type[BaseProxyConfig]:
         return Config