diff --git a/README.md b/README.md
new file mode 100644
index 0000000..abccfc5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,96 @@
+English | [中文](README_ZH.md)
+
+# Whisper Plugin for Maubot
+
+Whisper Plugin is a plugin for Maubot that transcribes audio messages to text in Matrix clients using OpenAI's Whisper API.
+
+## Features
+
+- Automatically responds to audio and file messages in rooms
+- Calls the OpenAI Whisper API for audio transcription
+- Supports user and room whitelists
+- Supports replying in threads
+
+![Transcription example](sample1.png)
+
+## Installation
+
+1. **Clone or download the plugin code**:
+
+    ```bash
+    git clone <repository-url>
+    cd <repository-directory>
+    zip -9r maubot-stt.mbp *
+    ```
+
+2. **Configure Maubot**:
+
+    Make sure you have installed and set up Maubot. Refer to [Maubot's official documentation](https://docs.mau.fi/maubot/usage/basic.html) for detailed steps.
+
+3. **Upload the plugin**:
+
+    Upload the generated `maubot-stt.mbp` file in the Maubot management interface.
+
+## Configuration
+
+After uploading the plugin, you need to configure it. Here are the configuration items:
+
+- `whisper_endpoint`: OpenAI Whisper API endpoint, default is `https://api.openai.com/v1/audio/transcriptions`.
+- `openai_api_key`: Your OpenAI API key.
+- `allowed_users`: List of users allowed to use this plugin. If empty, all users are allowed.
+- `allowed_rooms`: List of rooms allowed to use this plugin. If empty, all rooms are allowed.
+- `prompt`: Optional text to guide the model's style or to continue a previous audio segment. It should match the language of the audio.
+- `language`: Language of the input audio in ISO-639-1 format (e.g. `zh` for Chinese). Setting it improves accuracy and latency; leave it empty to auto-detect.
+
+### Configuration Example
+
+In the Maubot management interface, go to the plugin's configuration page and fill in the following content:
+
+```yaml
+whisper_endpoint: https://api.openai.com/v1/audio/transcriptions
+openai_api_key: your_openai_api_key
+allowed_users: ["@user1:matrix.org", "@user2:matrix.org"]
+allowed_rooms: ["!roomid:matrix.org"]
+```
+
+After saving, it is recommended to close and reopen the instance configuration to confirm that the changes were applied.
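+
+### Verifying your credentials
+
+Before filling the key into Maubot, you can sanity-check `openai_api_key` and `whisper_endpoint` with a short standalone script. The following is a minimal sketch, not part of the plugin: it assumes a local `audio.mp3` and sends the same multipart fields (`file`, `model`) that the plugin builds in `transcribe_audio`.
+
+```python
+import asyncio
+from pathlib import Path
+
+import aiohttp
+
+WHISPER_ENDPOINT = "https://api.openai.com/v1/audio/transcriptions"
+API_KEY = "sk-..."  # your OpenAI API key
+
+async def main() -> None:
+    # Same multipart form the plugin builds in transcribe_audio().
+    data = aiohttp.FormData()
+    data.add_field("file", Path("audio.mp3").read_bytes(),
+                   filename="audio.mp3", content_type="audio/mpeg")
+    data.add_field("model", "whisper-1")
+    headers = {"Authorization": f"Bearer {API_KEY}"}
+    async with aiohttp.ClientSession() as session:
+        async with session.post(WHISPER_ENDPOINT, headers=headers, data=data) as resp:
+            # A successful response is JSON with a "text" field.
+            print(resp.status, (await resp.json()).get("text"))
+
+asyncio.run(main())
+```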
+
+## Usage
+
+The plugin automatically listens to messages in the room and performs the following actions upon receiving a voice message or audio file:
+
+1. Download the audio file.
+2. Call the OpenAI Whisper API for transcription.
+3. Send the transcription result as a text message to the corresponding room.
+
+## License
+
+This project is licensed under the MIT License.
\ No newline at end of file
diff --git a/README_ZH.md b/README_ZH.md
new file mode 100644
index 0000000..61dd519
--- /dev/null
+++ b/README_ZH.md
@@ -0,0 +1,96 @@
+[English](README.md) | 中文
+
+# Whisper Plugin for Maubot
+
+Whisper Plugin 是一个用于 Maubot 的插件,支持在 Matrix 客户端用 OpenAI 的 Whisper API 将音频消息转录为文本。
+
+## 功能
+
+- 自动响应房间中的音频和文件消息
+- 调用 OpenAI Whisper API 进行音频转录
+- 支持用户和房间白名单
+- 支持在线程中回复
+
+![转录示例](sample1.png)
+
+## 安装
+
+1. **克隆或下载插件代码**:
+
+    ```bash
+    git clone <repository-url>
+    cd <repository-directory>
+    zip -9r maubot-stt.mbp *
+    ```
+
+2. **配置 Maubot**:
+
+    请确保你已经安装并设置好了 Maubot。具体步骤可以参考 [Maubot 的官方文档](https://docs.mau.fi/maubot/usage/basic.html)。
+
+3. **上传插件**:
+
+    在 Maubot 管理界面上传生成的 `maubot-stt.mbp` 文件。
+
+## 配置
+
+上传插件后,你需要进行一些配置。以下是配置项的说明:
+
+- `whisper_endpoint`:OpenAI Whisper API 端点,默认为 `https://api.openai.com/v1/audio/transcriptions`。
+- `openai_api_key`:你的 OpenAI API 密钥。
+- `allowed_users`:允许使用此插件的用户列表。如果为空列表,则允许所有用户使用。
+- `allowed_rooms`:允许使用此插件的房间列表。如果为空列表,则允许所有房间使用。
+- `prompt`:可选的提示文本,用于引导模型的风格或衔接上一段音频,应与音频语言一致。
+- `language`:输入音频的语言,使用 ISO-639-1 格式(如中文为 `zh`)。填写后可提高准确度并降低延迟;留空则自动检测。
+
+### 配置示例
+
+在 Maubot 管理界面,进入插件的配置页面,填写以下内容:
+
+```yaml
+whisper_endpoint: https://api.openai.com/v1/audio/transcriptions
+openai_api_key: your_openai_api_key
+allowed_users: ["@user1:matrix.org", "@user2:matrix.org"]
+allowed_rooms: ["!roomid:matrix.org"]
+```
+
+建议保存配置后关闭并重新打开实例配置,以确认修改已生效。
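+
+### 验证凭据
+
+在将密钥填入 Maubot 之前,可以用一段独立脚本快速检查 `openai_api_key` 和 `whisper_endpoint` 是否可用。以下只是一个最小示例,并非插件本身的一部分:它假设本地存在一个 `audio.mp3`,并发送与插件 `transcribe_audio` 中相同的 multipart 字段(`file`、`model`)。
+
+```python
+import asyncio
+from pathlib import Path
+
+import aiohttp
+
+WHISPER_ENDPOINT = "https://api.openai.com/v1/audio/transcriptions"
+API_KEY = "sk-..."  # 你的 OpenAI API 密钥
+
+async def main() -> None:
+    # 与插件 transcribe_audio() 构造的 multipart 表单一致。
+    data = aiohttp.FormData()
+    data.add_field("file", Path("audio.mp3").read_bytes(),
+                   filename="audio.mp3", content_type="audio/mpeg")
+    data.add_field("model", "whisper-1")
+    headers = {"Authorization": f"Bearer {API_KEY}"}
+    async with aiohttp.ClientSession() as session:
+        async with session.post(WHISPER_ENDPOINT, headers=headers, data=data) as resp:
+            # 成功的响应是包含 "text" 字段的 JSON。
+            print(resp.status, (await resp.json()).get("text"))
+
+asyncio.run(main())
+```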
+
+## 使用
+
+插件会自动监听房间中的消息,在接收到语音消息或音频文件时进行以下操作:
+
+1. 下载音频文件。
+2. 调用 OpenAI Whisper API 进行转录。
+3. 将转录结果以文本消息的形式发送到相应的房间。
+
+## 许可证
+
+本项目采用 MIT 许可证。
\ No newline at end of file
diff --git a/base-config.yaml b/base-config.yaml
new file mode 100644
index 0000000..249b9bb
--- /dev/null
+++ b/base-config.yaml
@@ -0,0 +1,21 @@
+# Your OpenAI API key
+openai_api_key: sk-
+
+# API endpoint to connect to
+whisper_endpoint: https://api.openai.com/v1/audio/transcriptions
+
+# List of allowed users
+# [] means no restriction
+allowed_users: []
+
+# List of allowed rooms
+# [] means no restriction
+allowed_rooms: []
+
+# Optional text to guide the model's style or to continue from a previous audio segment.
+# The prompt should match the language of the audio.
+prompt: "以下是普通话录制的会议记录:"
+
+# The language of the input audio in ISO-639-1 format (Chinese: "zh").
+# Providing it will improve accuracy and latency; leave empty by default.
+language:
diff --git a/maubot.yaml b/maubot.yaml
new file mode 100644
index 0000000..b8d0706
--- /dev/null
+++ b/maubot.yaml
@@ -0,0 +1,11 @@
+maubot: 0.1.0
+id: nigzu.com.maubot-stt
+version: 0.2.6
+license: MIT
+modules:
+- openai-whisper
+main_class: WhisperPlugin
+config: true
+extra_files:
+- base-config.yaml
+database: false
diff --git a/openai-whisper.py b/openai-whisper.py
new file mode 100644
index 0000000..bc4aa36
--- /dev/null
+++ b/openai-whisper.py
@@ -0,0 +1,118 @@
+import html
+from typing import Type
+
+import aiohttp
+
+from maubot import Plugin, MessageEvent
+from maubot.handlers import event
+from mautrix.types import (
+    EventType,
+    Format,
+    InReplyTo,
+    MessageType,
+    RelatesTo,
+    RelationType,
+    TextMessageEventContent,
+)
+from mautrix.util.config import BaseProxyConfig, ConfigUpdateHelper
+
+
+class Config(BaseProxyConfig):
+    def do_update(self, helper: ConfigUpdateHelper) -> None:
+        helper.copy("whisper_endpoint")
+        helper.copy("openai_api_key")
+        helper.copy("allowed_users")
+        helper.copy("allowed_rooms")
+        helper.copy("prompt")
+        helper.copy("language")
+
+
+class WhisperPlugin(Plugin):
+    async def start(self) -> None:
+        await super().start()
+        self.config.load_and_update()
+        self.whisper_endpoint = self.config["whisper_endpoint"]
+        self.api_key = self.config["openai_api_key"]
+        self.prompt = self.config["prompt"]
+        self.language = self.config["language"]
+        self.allowed_users = self.config["allowed_users"]
+        self.allowed_rooms = self.config["allowed_rooms"]
+        self.log.debug("Whisper plugin started")
+
+    async def should_respond(self, event: MessageEvent) -> bool:
+        # Never respond to our own messages.
+        if event.sender == self.client.mxid:
+            return False
+        # Empty whitelists mean no restriction.
+        if self.allowed_users and event.sender not in self.allowed_users:
+            return False
+        if self.allowed_rooms and event.room_id not in self.allowed_rooms:
+            return False
+        return event.content.msgtype in (MessageType.AUDIO, MessageType.FILE)
+
+    @event.on(EventType.ROOM_MESSAGE)
+    async def on_message(self, event: MessageEvent) -> None:
+        if not await self.should_respond(event):
+            return
+
+        try:
+            await event.mark_read()
+            await self.client.set_typing(event.room_id, timeout=99999)
+
+            audio_bytes = await self.client.download_media(url=event.content.url)
+            transcription = await self.transcribe_audio(audio_bytes)
+
+            content = TextMessageEventContent(
+                msgtype=MessageType.TEXT,
+                body=transcription,
+                format=Format.HTML,
+                formatted_body=html.escape(transcription),
+            )
+            if event.content.relates_to and event.content.relates_to.rel_type == RelationType.THREAD:
+                # The audio was posted in a thread: answer in the same thread.
+                await event.respond(content, in_thread=True)
+            else:
+                # Otherwise send the transcription as a reply to the audio message.
+                content.relates_to = RelatesTo(in_reply_to=InReplyTo(event_id=event.event_id))
+                await event.respond(content)
+        except Exception as e:
+            self.log.exception(f"Something went wrong: {e}")
+            await event.respond(f"Something went wrong: {e}")
+        finally:
+            # Always clear the typing indicator, even if transcription failed.
+            await self.client.set_typing(event.room_id, timeout=0)
+
+    async def transcribe_audio(self, audio_bytes: bytes) -> str:
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        data = aiohttp.FormData()
+        data.add_field("file", audio_bytes, filename="audio.mp3", content_type="audio/mpeg")
+        data.add_field("model", "whisper-1")
+        if self.prompt:
+            data.add_field("prompt", self.prompt)
+        if self.language:
+            data.add_field("language", self.language)
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(self.whisper_endpoint, headers=headers, data=data) as response:
+                    if response.status != 200:
+                        error_text = await response.text()
+                        self.log.error(f"Error response from API: {error_text}")
+                        return f"Error: {error_text}"
+                    response_json = await response.json()
+                    return response_json.get("text", "Sorry, I can't transcribe the audio.")
+        except Exception as e:
+            self.log.exception(f"Failed to transcribe audio, msg: {e}")
+            return "Sorry, an error occurred while transcribing the audio."
+
+    @classmethod
+    def get_config_class(cls) -> Type[BaseProxyConfig]:
+        return Config
+
+    def save_config(self) -> None:
+        self.config.save()
+
+    async def update_config(self, new_config: dict) -> None:
+        self.config.update(new_config)
+        self.save_config()
+        self.log.debug("Configuration updated and saved")
diff --git a/sample1.png b/sample1.png
new file mode 100644
index 0000000..a938125
Binary files a/sample1.png and b/sample1.png differ