Compare commits

..

3 Commits

Author SHA1 Message Date
MrRaph_
63b9d0c3db feat: excluding .mbp files 2024-12-13 15:30:40 +01:00
MrRaph_
5ba28d859c feat: checking file mime type before sending it to Whisper 2024-12-13 15:29:42 +01:00
MrRaph_
693472186d feat: added transcription analysis to create reminders/agendas 2024-12-09 21:08:08 +01:00
4 changed files with 50 additions and 2 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*.mbp

View File

@@ -14,5 +14,8 @@ allowed_rooms: []
# Optional text to guide the model's style or to continue from a previous audio segment. The prompt should match the language of the audio.
prompt: "以下是普通话录制的会议记录:"
# Optional: should OpenAI re-read the transcript to detect reminders and events?
search_reminders_and_events: false
# The language of the input audio. Providing the input language in ISO-639-1 format (Chinese: "zh") will improve accuracy and latency, leave empty by default
language:

View File

@@ -1,6 +1,6 @@
maubot: 0.1.0
id: nigzu.com.maubot-stt
version: 0.2.6
version: 0.3.1
license: MIT
modules:
- openai-whisper

View File

@@ -12,6 +12,9 @@ from mautrix.errors import MatrixRequestError
from mautrix.types import EventType, MessageType, RelationType, TextMessageEventContent, Format,RelatesTo,InReplyTo
from mautrix.util.config import BaseProxyConfig, ConfigUpdateHelper
ALLOWED_EXTENSIONS = ['flac', 'm4a', 'mp3', 'mp4', 'mpeg', 'mpga', 'oga', 'ogg', 'wav', 'webm']
ALLOWED_MIME_TYPES = ['audio/flac','audio/mp4','video/mpeg','audio/ogg','audio/wav','video/webm']
class Config(BaseProxyConfig):
def do_update(self, helper: ConfigUpdateHelper) -> None:
helper.copy("whisper_endpoint")
@@ -19,6 +22,7 @@ class Config(BaseProxyConfig):
helper.copy("allowed_users")
helper.copy("allowed_rooms")
helper.copy("prompt")
helper.copy("search_reminders_and_events")
helper.copy("language")
class WhisperPlugin(Plugin):
@@ -32,6 +36,7 @@ class WhisperPlugin(Plugin):
self.language = self.config['language']
self.allowed_users = self.config['allowed_users']
self.allowed_rooms = self.config['allowed_rooms']
self.search_reminders_and_events = self.config['search_reminders_and_events']
self.log.debug("Whisper plugin started")
async def should_respond(self, event: MessageEvent) -> bool:
@@ -44,6 +49,9 @@ class WhisperPlugin(Plugin):
if self.allowed_rooms and event.room_id not in self.allowed_rooms:
return False
if event.content.info.mimetype not in ALLOWED_MIME_TYPES:
return False
return event.content.msgtype == MessageType.AUDIO or event.content.msgtype == MessageType.FILE
@event.on(EventType.ROOM_MESSAGE)
@@ -58,8 +66,10 @@ class WhisperPlugin(Plugin):
audio_bytes = await self.client.download_media(url=event.content.url)
transcription = await self.transcribe_audio(audio_bytes)
await self.client.set_typing(event.room_id, timeout=0)
if self.search_reminders_and_events:
transcription = await self.study_transcribe(transcription)
await self.client.set_typing(event.room_id, timeout=0)
content = TextMessageEventContent(
msgtype=MessageType.TEXT,
body=transcription,
@@ -104,6 +114,40 @@ class WhisperPlugin(Plugin):
self.log.exception(f"Failed to transcribe audio, msg: {e}")
return "Sorry, an error occurred while transcribing the audio."
async def study_transcribe(self, transcription: str) -> str:
    """Post-process a Whisper transcription with GPT-4 to detect reminder/event intents.

    Sends the transcription to the OpenAI chat-completions endpoint with a
    French instruction prompt. The model is asked to answer with either a
    ``!rappel``/``!agenda`` bot command or the unmodified transcription.

    :param transcription: Raw text produced by the Whisper transcription step.
    :return: The model's reply, or ``transcription`` unchanged when the API
             response has no usable ``choices`` entry (e.g. an error body).
    """
    prompt = f"""
Voici la transcription du message vocal :
{transcription}
Ton objectif est d'analyser cette transcription afin de déterminer si l'utilisateur tente de créer un rappel ou un évènement.
- Si l'utilisateur essaie de créer un rappel, la sortie doit prendre la forme :
!rappel <date> <message>
- Si l'utilisateur essaie de créer un évènement, la sortie doit prendre la forme :
!agenda ##ROOM## <message>
- Si l'utilisateur ne cherche ni à créer un rappel ni un évènement, renvoie seulement la transcription telle quelle, sans ajout d'explication, de texte supplémentaire ou de ponctuation superflue.
Ne fournis aucun autre texte ni explication dans ta réponse, uniquement la sortie demandée.
"""
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {self.api_key}"
    }
    data = {
        "model": "gpt-4",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=data) as response:
            response_json = await response.json()
    # BUG FIX: the previous code did `.get('choices', [])[0]`, so an API
    # error body (no "choices" key, or an empty list) raised IndexError
    # instead of falling back to the original transcription.
    choices = response_json.get('choices') or []
    if not choices:
        return transcription
    return choices[0].get('message', {}).get('content', transcription)
@classmethod
def get_config_class(cls) -> Type[BaseProxyConfig]:
    """Tell maubot which config class backs this plugin's settings."""
    return Config