add config file and minor refactoring

use same model instead of instantiating a new one to reserve vram
2025-02-13 20:00:21 -05:00 · 2025-02-13 19:06:40 -05:00
4 changed files with 64 additions and 31 deletions
--- a/README.md
+++ b/README.md
@ -36,3 +36,8 @@ Press the hotkey (default is Home) and then talk to jarvis
    - "hey jarvis, thats a clip"
 Check out the code to see the specific keywords/phrases, as all NLP is regex/string based, not generative AI
 ### Resource Usage
 With the `distil-small.en` model, it uses about 500 MB of VRAM on my system
--- a/config.py
+++ b/config.py
@ -0,0 +1,16 @@
 model = "distil-small.en"
 replacements = {"gigi": "gg", "heels": "heals", "heeling": "healing", "heel": "heal"}
 maximum_pulse = [
    "maximum",
    "pulse",
    "ball",
    "remove",
    "eliminate",
    "murder",
    "goon",
    "obliterate",
    "delete",
    "piss",
 ]
--- a/main.py
+++ b/main.py
@ -1,35 +1,51 @@
 import pyautogui as pg
 from pynput import keyboard
 import speech_recognition as sr
 from faster_whisper import WhisperModel
 from string import punctuation
-from slang import replacements
+import config
 import re
 import subprocess
 from time import sleep
 from mss import mss
 import numpy as np
-import sounddevice # turns off alsa error logging
+from io import BytesIO
 import sounddevice  # turns off alsa error logging
 r = sr.Recognizer()
 r.pause_threshold = 2
 model = WhisperModel(config.model, device="cuda", compute_type="int8_float16")
 print("Testing Sound")
 with sr.Microphone() as source:
    r.adjust_for_ambient_noise(source, duration=3)
 print("ready!")
 def recognize_text() -> str:
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)
-        result = r.recognize_faster_whisper(audio, model="distil-small.en", beam_size=5, language="en", condition_on_previous_text=False)
+        results, _ = model.transcribe(
            BytesIO(audio.get_wav_data()),
            beam_size=5,
            language="en",
            condition_on_previous_text=False,
        )
    return " ".join([segment.text for segment in results])
    return result
 def chat_type():
-    screen = mss() #! bad for performance but necessary
+    screen = mss()  #! bad for performance but necessary
-    screenshot = np.array(screen.grab({"top": 1090, "left": 1110, "width": 100, "height": 100}))
+    screenshot = np.array(
        screen.grab({"top": 1090, "left": 1110, "width": 100, "height": 100})
    )
    try:
        pg.locate("ui/team-chat.png", screenshot, confidence=0.9)
        return "team"
@ -44,29 +60,34 @@ def chat_type():
    return None
 def on_press(key):
    if key is not keyboard.Key.home:
        return
-    print("triggered!")
+    print("Listening...")
    command = recognize_text()
    print(f"Heard: {command}")
    # cleanup command
-    command = command.lower()
+    command = command.lower().strip()
    for char in punctuation:
-        command = command.replace(char, '')
+        command = command.replace(char, "")
-    for original, new in replacements.items():
+    for original, new in config.replacements.items():
        command = command.replace(original, new)
    print(f"Cleaned up command: {command}")
-    if "chat" in command:
+    if any(keyword in command for keyword in ["type", "say", "write"]):
-        message = re.search(r"type (.+?)(and |in |\n|$)", command).groups(0)[0].strip()
+        message = (
-        print(f"Typing: {message} in team chat")
+            re.search(r"(type|say|write) (.+?)(and |in |\n|$)", command)
            .groups(0)[1]
            .strip()
        )
        print(f"Typing: {message} in chat")
        pg.keyDown("enter")
        sleep(0.041)
@ -74,8 +95,8 @@ def on_press(key):
        sleep(0.94)
        current_chat = chat_type()
-        if current_chat in command:
+        if current_chat == None or current_chat in command:
-            pass # no change needed
+            pass  # no change needed
        elif "match" in command or "team" in command:
            pg.keyDown("tab")
            sleep(0.041)
@ -86,7 +107,7 @@ def on_press(key):
        # sleep(0.074)
        # pg.keyUp("enter")
-    elif any(keyword in command for keyword in ["maximum", "pulse", "balls", "remove", "eliminate", "murder", "goon"]):
+    elif any(keyword in command for keyword in config.maximum_pulse):
        print("MAXIMUM PULSE!!!!")
        pg.keyDown("q")
        sleep(0.032)
@ -96,9 +117,6 @@ def on_press(key):
        subprocess.run("/home/ultrablob/Videos/Clips/save_clip.sh")
 # Collect events until released
-with keyboard.Listener(
+with keyboard.Listener(on_press=on_press, on_release=lambda event: None) as listener:
        on_press=on_press,
        on_release=lambda event: None) as listener:
    listener.join()
--- a/slang.py
+++ b/slang.py
@ -1,6 +0,0 @@
 replacements = {
    "gigi": "gg",
    "heels": "heals",
    "heeling": "healing",
    "heel": "heal"
 }
Author	SHA1	Message	Date
ultrablob	7f5d221c56	add config file and minor refactoring	2025-02-13 20:00:21 -05:00
ultrablob	01f13e5ab7	use same model instead of instantiating a new one to reserve vram	2025-02-13 19:06:40 -05:00