From 01f13e5ab70bd4bd757fe05cdf0349617825a0de Mon Sep 17 00:00:00 2001 From: ultrablob Date: Thu, 13 Feb 2025 19:06:40 -0500 Subject: [PATCH 1/2] use same model instead of instantiating a new one to reserve vram --- main.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index eb2e8e6..d6a4720 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,7 @@ import pyautogui as pg from pynput import keyboard import speech_recognition as sr +from faster_whisper import WhisperModel from string import punctuation from slang import replacements import re @@ -8,22 +9,26 @@ import subprocess from time import sleep from mss import mss import numpy as np +from io import BytesIO import sounddevice # turns off alsa error logging r = sr.Recognizer() +model = WhisperModel("distil-small.en", device="cuda", compute_type="float16") + +print("Testing Sound") with sr.Microphone() as source: r.adjust_for_ambient_noise(source, duration=3) +print("ready!") def recognize_text() -> str: with sr.Microphone() as source: - print("Say something!") audio = r.listen(source) - result = r.recognize_faster_whisper(audio, model="distil-small.en", beam_size=5, language="en", condition_on_previous_text=False) + results, _ = model.transcribe(BytesIO(audio.get_wav_data()), beam_size=5, language="en", condition_on_previous_text=False) - return result + return " ".join([segment.text for segment in results]) def chat_type(): @@ -49,13 +54,13 @@ def on_press(key): if key is not keyboard.Key.home: return - print("triggered!") + print("Listening...") command = recognize_text() print(f"Heard: {command}") # cleanup command - command = command.lower() + command = command.lower().strip() for char in punctuation: command = command.replace(char, '') @@ -86,7 +91,7 @@ def on_press(key): # sleep(0.074) # pg.keyUp("enter") - elif any(keyword in command for keyword in ["maximum", "pulse", "balls", "remove", "eliminate", "murder", "goon"]): + elif any(keyword in command for keyword in ["maximum", "pulse", "balls", "remove", "eliminate", "murder", "goon", "obliterate", "delete", "piss"]): print("MAXIMUM PULSE!!!!") pg.keyDown("q") sleep(0.032) From 7f5d221c56ecb883f7a648000f79a20fc5a93c4a Mon Sep 17 00:00:00 2001 From: ultrablob Date: Thu, 13 Feb 2025 20:00:21 -0500 Subject: [PATCH 2/2] add config file and minor refactoring --- README.md | 7 ++++++- config.py | 16 ++++++++++++++++ main.py | 55 ++++++++++++++++++++++++++++++++++--------------------- slang.py | 6 ------ 4 files changed, 56 insertions(+), 28 deletions(-) create mode 100644 config.py delete mode 100644 slang.py diff --git a/README.md b/README.md index c9b7a50..7670ac6 100644 --- a/README.md +++ b/README.md @@ -35,4 +35,9 @@ Press the hotkey (default is Home) and then talk to jarvis - "jarvis, clip that" - "hey jarvis, thats a clip" -Checkout the code to see specific keywords/phrases as all NLP is regex/string based, not generative AI \ No newline at end of file +Checkout the code to see specific keywords/phrases as all NLP is regex/string based, not generative AI + + +### Resource Usage + +With the `distil-small.en` model, on my system it uses about 500mb of VRAM diff --git a/config.py b/config.py new file mode 100644 index 0000000..399fd8e --- /dev/null +++ b/config.py @@ -0,0 +1,16 @@ +model = "distil-small.en" + +replacements = {"gigi": "gg", "heels": "heals", "heeling": "healing", "heel": "heal"} + +maximum_pulse = [ + "maximum", + "pulse", + "ball", + "remove", + "eliminate", + "murder", + "goon", + "obliterate", + "delete", + "piss", +] \ No newline at end of file diff --git a/main.py b/main.py index d6a4720..6f864c9 100644 --- a/main.py +++ b/main.py @@ -3,38 +3,49 @@ from pynput import keyboard import speech_recognition as sr from faster_whisper import WhisperModel from string import punctuation -from slang import replacements +import config import re import subprocess from time import sleep from mss import mss import numpy as np from io import BytesIO -import sounddevice # turns off alsa error logging +import sounddevice # turns off alsa error logging r = sr.Recognizer() -model = WhisperModel("distil-small.en", device="cuda", compute_type="float16") +r.pause_threshold = 2 + +model = WhisperModel(config.model, device="cuda", compute_type="int8_float16") print("Testing Sound") with sr.Microphone() as source: r.adjust_for_ambient_noise(source, duration=3) print("ready!") + def recognize_text() -> str: with sr.Microphone() as source: audio = r.listen(source) - results, _ = model.transcribe(BytesIO(audio.get_wav_data()), beam_size=5, language="en", condition_on_previous_text=False) + results, _ = model.transcribe( + BytesIO(audio.get_wav_data()), + beam_size=5, + language="en", + condition_on_previous_text=False, + ) return " ".join([segment.text for segment in results]) + def chat_type(): - screen = mss() #! bad for performance but necessary + screen = mss() #! bad for performance but necessary - screenshot = np.array(screen.grab({"top": 1090, "left": 1110, "width": 100, "height": 100})) + screenshot = np.array( + screen.grab({"top": 1090, "left": 1110, "width": 100, "height": 100}) + ) try: pg.locate("ui/team-chat.png", screenshot, confidence=0.9) return "team" @@ -49,11 +60,12 @@ def chat_type(): return None + def on_press(key): if key is not keyboard.Key.home: return - + print("Listening...") command = recognize_text() @@ -62,25 +74,29 @@ def on_press(key): # cleanup command command = command.lower().strip() for char in punctuation: - command = command.replace(char, '') + command = command.replace(char, "") - for original, new in replacements.items(): + for original, new in config.replacements.items(): command = command.replace(original, new) print(f"Cleaned up command: {command}") - if "chat" in command: - message = re.search(r"type (.+?)(and |in |\n|$)", command).groups(0)[0].strip() - print(f"Typing: {message} in team chat") - + if any(keyword in command for keyword in ["type", "say", "write"]): + message = ( + re.search(r"(type|say|write) (.+?)(and |in |\n|$)", command) + .groups(0)[1] + .strip() + ) + print(f"Typing: {message} in chat") + pg.keyDown("enter") sleep(0.041) pg.keyUp("enter") sleep(0.94) current_chat = chat_type() - if current_chat in command: - pass # no change needed + if current_chat == None or current_chat in command: + pass # no change needed elif "match" in command or "team" in command: pg.keyDown("tab") sleep(0.041) @@ -91,7 +107,7 @@ def on_press(key): # sleep(0.074) # pg.keyUp("enter") - elif any(keyword in command for keyword in ["maximum", "pulse", "balls", "remove", "eliminate", "murder", "goon", "obliterate", "delete", "piss"]): + elif any(keyword in command for keyword in config.maximum_pulse): print("MAXIMUM PULSE!!!!") pg.keyDown("q") sleep(0.032) @@ -100,10 +116,7 @@ def on_press(key): elif "clip" in command: subprocess.run("/home/ultrablob/Videos/Clips/save_clip.sh") - # Collect events until released -with keyboard.Listener( - on_press=on_press, - on_release=lambda event: None) as listener: - listener.join() \ No newline at end of file +with keyboard.Listener(on_press=on_press, on_release=lambda event: None) as listener: + listener.join() diff --git a/slang.py b/slang.py deleted file mode 100644 index 906ba55..0000000 --- a/slang.py +++ /dev/null @@ -1,6 +0,0 @@ -replacements = { - "gigi": "gg", - "heels": "heals", - "heeling": "healing", - "heel": "heal" -} \ No newline at end of file