Compare commits

...

7 Commits

Author SHA1 Message Date
MrRaph_
a25e166c14 . 2025-03-27 08:57:24 +01:00
MrRaph_
db470a5d28 . 2025-03-27 08:57:04 +01:00
MrRaph_
5872e0b288 fix: if no agenda/reminder should send original transcript 2025-03-26 08:03:30 +01:00
MrRaph_
698ebf3bef fixed mime types 2025-03-26 08:01:57 +01:00
MrRaph_
63b9d0c3db feat: excluding .mbp files 2024-12-13 15:30:40 +01:00
MrRaph_
5ba28d859c feat: checking file mime type before sending it to Whisper 2024-12-13 15:29:42 +01:00
MrRaph_
693472186d feat: added transcription analysis to create reminders/agendas 2024-12-09 21:08:08 +01:00
4 changed files with 79 additions and 12 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*.mbp

View File

@@ -14,5 +14,8 @@ allowed_rooms: []
# Optional text to guide the model's style or to continue from a previous audio segment. The prompt should match the language of the audio.
prompt: "以下是普通话录制的会议记录:"
# Optional: should OpenAI re-read the transcript to detect reminders and events?
search_reminders_and_events: false
# The language of the input audio. Providing the input language in ISO-639-1 format (Chinese: "zh") will improve accuracy and latency, leave empty by default
language:

View File

@@ -1,6 +1,6 @@
maubot: 0.1.0
id: nigzu.com.maubot-stt
version: 0.2.6
version: 0.3.4
license: MIT
modules:
- openai-whisper

View File

@@ -9,9 +9,15 @@ from mautrix.client import Client
from maubot.handlers import event
from maubot import Plugin, MessageEvent
from mautrix.errors import MatrixRequestError
from mautrix.types import EventType, MessageType, RelationType, TextMessageEventContent, Format,RelatesTo,InReplyTo
from mautrix.types import (
EventType, MessageType, RelationType,
TextMessageEventContent, Format, RelatesTo, InReplyTo
)
from mautrix.util.config import BaseProxyConfig, ConfigUpdateHelper
ALLOWED_EXTENSIONS = ['flac', 'm4a', 'mp3', 'mp4', 'mpeg', 'mpga', 'oga', 'ogg', 'wav', 'webm']
ALLOWED_MIME_TYPES = ['audio/flac', 'audio/mp4', 'video/mpeg', 'audio/ogg', 'audio/wav', 'video/webm']
class Config(BaseProxyConfig):
def do_update(self, helper: ConfigUpdateHelper) -> None:
helper.copy("whisper_endpoint")
@@ -19,6 +25,7 @@ class Config(BaseProxyConfig):
helper.copy("allowed_users")
helper.copy("allowed_rooms")
helper.copy("prompt")
helper.copy("search_reminders_and_events")
helper.copy("language")
class WhisperPlugin(Plugin):
@@ -32,6 +39,7 @@ class WhisperPlugin(Plugin):
self.language = self.config['language']
self.allowed_users = self.config['allowed_users']
self.allowed_rooms = self.config['allowed_rooms']
self.search_reminders_and_events = self.config['search_reminders_and_events']
self.log.debug("Whisper plugin started")
async def should_respond(self, event: MessageEvent) -> bool:
@@ -44,7 +52,14 @@ class WhisperPlugin(Plugin):
if self.allowed_rooms and event.room_id not in self.allowed_rooms:
return False
return event.content.msgtype == MessageType.AUDIO or event.content.msgtype == MessageType.FILE
# Extraction de la partie principale du MIME type (avant les éventuels paramètres)
mime_type = ""
if event.content.info and event.content.info.mimetype:
mime_type = event.content.info.mimetype.split(";")[0]
if mime_type not in ALLOWED_MIME_TYPES:
return False
return event.content.msgtype in (MessageType.AUDIO, MessageType.FILE)
@event.on(EventType.ROOM_MESSAGE)
async def on_message(self, event: MessageEvent) -> None:
@@ -56,10 +71,24 @@ class WhisperPlugin(Plugin):
await self.client.set_typing(event.room_id, timeout=99999)
audio_bytes = await self.client.download_media(url=event.content.url)
transcription = await self.transcribe_audio(audio_bytes)
if not audio_bytes:
await event.respond("Erreur lors du téléchargement du fichier audio.")
return
# Récupère le nom de fichier s'il est défini, sinon utilise une valeur par défaut
filename = getattr(event.content, "filename", "audio.mp3")
# Utilise le MIME type tel quel, ou une valeur par défaut
mime_type = event.content.info.mimetype if event.content.info and event.content.info.mimetype else "audio/mpeg"
transcription = await self.transcribe_audio(audio_bytes, filename, mime_type)
# Si l'étude est activée, on utilise son résultat uniquement si celui-ci renvoie une commande
if self.search_reminders_and_events:
studied = await self.study_transcribe(transcription)
if studied.startswith("!rappel") or studied.startswith("!agenda"):
transcription = studied
await self.client.set_typing(event.room_id, timeout=0)
content = TextMessageEventContent(
msgtype=MessageType.TEXT,
body=transcription,
@@ -79,31 +108,65 @@ class WhisperPlugin(Plugin):
self.log.exception(f"Something went wrong: {e}")
await event.respond(f"Something went wrong: {e}")
async def transcribe_audio(self, audio_bytes: bytes, filename: str = "audio.mp3", mime_type: str = "audio/mpeg") -> str:
    """Send audio bytes to the configured Whisper endpoint and return the text.

    Args:
        audio_bytes: Raw bytes of the downloaded audio file.
        filename: Filename forwarded in the multipart upload; defaults keep
            backward compatibility with callers that pass only the bytes.
        mime_type: Content type forwarded in the multipart upload.

    Returns:
        The transcribed text on success, or a human-readable error string
        (never raises to the caller).
    """
    headers = {
        "Authorization": f"Bearer {self.api_key}"
    }
    data = aiohttp.FormData()
    data.add_field('file', audio_bytes, filename=filename, content_type=mime_type)
    data.add_field('model', 'whisper-1')
    # Optional fields are only attached when configured.
    if self.prompt:
        data.add_field('prompt', self.prompt)
    if self.language:
        data.add_field('language', self.language)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(self.whisper_endpoint, headers=headers, data=data) as response:
                if response.status != 200:
                    # Read the body once; a second read on the same response would fail.
                    error_text = await response.text()
                    self.log.error(f"Error response from API: {error_text}")
                    return f"Error: {error_text}"
                response_json = await response.json()
                return response_json.get("text", "Sorry, I can't transcribe the audio.")
    except Exception as e:
        self.log.exception(f"Failed to transcribe audio, msg: {e}")
        return "Sorry, an error occurred while transcribing the audio."
async def study_transcribe(self, transcription: str) -> str:
    """Ask the chat model whether the transcript asks to create a reminder/event.

    Returns either a "!rappel ..." / "!agenda ..." bot command produced by the
    model, or the original transcription unchanged. Any API error or malformed
    response also falls back to the original transcription, so a failed
    analysis can never lose the transcript.
    """
    prompt = f"""
Voici la transcription du message vocal :
{transcription}
Ton objectif est d'analyser cette transcription afin de déterminer si l'utilisateur tente de créer un rappel ou un évènement.
- Si l'utilisateur essaie de créer un rappel, la sortie doit prendre la forme :
!rappel <date> <message>
- Si l'utilisateur essaie de créer un évènement, la sortie doit prendre la forme :
!agenda ##ROOM## <message>
- Si l'utilisateur ne cherche ni à créer un rappel ni un évènement, renvoie seulement la transcription telle quelle, sans ajout d'explication, de texte supplémentaire ou de ponctuation superflue.
Ne fournis aucun autre texte ni explication dans ta réponse, uniquement la sortie demandée.
"""
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {self.api_key}"
    }
    data = {
        "model": "gpt-4",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, headers=headers, json=data) as response:
                if response.status != 200:
                    self.log.error(f"Error response from chat API: {await response.text()}")
                    return transcription
                response_json = await response.json()
    except Exception as e:
        self.log.exception(f"Failed to analyse transcription, msg: {e}")
        return transcription
    # `choices` is absent or empty on API error payloads; the previous
    # `get('choices', [])[0]` raised IndexError in that case.
    choices = response_json.get('choices') or []
    if not choices:
        return transcription
    return choices[0].get('message', {}).get('content', transcription)
@classmethod
def get_config_class(cls) -> Type[BaseProxyConfig]:
    """Return the config class maubot should use to load/update this plugin's settings."""
    return Config