From 01f13e5ab70bd4bd757fe05cdf0349617825a0de Mon Sep 17 00:00:00 2001
From: ultrablob <neelgillshah@gmail.com>
Date: Thu, 13 Feb 2025 19:06:40 -0500
Subject: [PATCH 1/2] use same model instead of instantiating a new one to
 reserve vram

---
 main.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index eb2e8e6..d6a4720 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,7 @@
 import pyautogui as pg
 from pynput import keyboard
 import speech_recognition as sr
+from faster_whisper import WhisperModel
 from string import punctuation
 from slang import replacements
 import re
@@ -8,22 +9,26 @@ import subprocess
 from time import sleep
 from mss import mss
 import numpy as np
+from io import BytesIO
 import sounddevice # turns off alsa error logging
 
 r = sr.Recognizer()
 
+model = WhisperModel("distil-small.en", device="cuda", compute_type="float16")
+
+print("Testing Sound")
 with sr.Microphone() as source:
     r.adjust_for_ambient_noise(source, duration=3)
+print("ready!")
 
 def recognize_text() -> str:
 
     with sr.Microphone() as source:
-        print("Say something!")
         audio = r.listen(source)
 
-        result = r.recognize_faster_whisper(audio, model="distil-small.en", beam_size=5, language="en", condition_on_previous_text=False)
+        results, _ = model.transcribe(BytesIO(audio.get_wav_data()), beam_size=5, language="en", condition_on_previous_text=False)
 
-    return result
+    return " ".join([segment.text for segment in results])
 
 def chat_type():
 
@@ -49,13 +54,13 @@ def on_press(key):
     if key is not keyboard.Key.home:
         return
     
-    print("triggered!")
+    print("Listening...")
     command = recognize_text()
 
     print(f"Heard: {command}")
 
     # cleanup command
-    command = command.lower()
+    command = command.lower().strip()
     for char in punctuation:
         command = command.replace(char, '')
 
@@ -86,7 +91,7 @@ def on_press(key):
         # sleep(0.074)
         # pg.keyUp("enter")
 
-    elif any(keyword in command for keyword in ["maximum", "pulse", "balls", "remove", "eliminate", "murder", "goon"]):
+    elif any(keyword in command for keyword in ["maximum", "pulse", "balls", "remove", "eliminate", "murder", "goon", "obliterate", "delete", "piss"]):
         print("MAXIMUM PULSE!!!!")
         pg.keyDown("q")
         sleep(0.032)

From 7f5d221c56ecb883f7a648000f79a20fc5a93c4a Mon Sep 17 00:00:00 2001
From: ultrablob <neelgillshah@gmail.com>
Date: Thu, 13 Feb 2025 20:00:21 -0500
Subject: [PATCH 2/2] add config file and minor refactoring

---
 README.md |  7 ++++++-
 config.py | 16 ++++++++++++++++
 main.py   | 55 ++++++++++++++++++++++++++++++++++---------------------
 slang.py  |  6 ------
 4 files changed, 56 insertions(+), 28 deletions(-)
 create mode 100644 config.py
 delete mode 100644 slang.py

diff --git a/README.md b/README.md
index c9b7a50..7670ac6 100644
--- a/README.md
+++ b/README.md
@@ -35,4 +35,9 @@ Press the hotkey (default is Home) and then talk to jarvis
     - "jarvis, clip that"
     - "hey jarvis, thats a clip"
 
-Checkout the code to see specific keywords/phrases as all NLP is regex/string based, not generative AI
\ No newline at end of file
+Check out the code to see the specific keywords/phrases, as all NLP is regex/string based, not generative AI.
+
+
+### Resource Usage
+
+With the `distil-small.en` model, it uses about 500 MB of VRAM on my system.
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..399fd8e
--- /dev/null
+++ b/config.py
@@ -0,0 +1,16 @@
+model = "distil-small.en"
+
+replacements = {"gigi": "gg", "heels": "heals", "heeling": "healing", "heel": "heal"}
+
+maximum_pulse = [
+    "maximum",
+    "pulse",
+    "ball",
+    "remove",
+    "eliminate",
+    "murder",
+    "goon",
+    "obliterate",
+    "delete",
+    "piss",
+]
\ No newline at end of file
diff --git a/main.py b/main.py
index d6a4720..6f864c9 100644
--- a/main.py
+++ b/main.py
@@ -3,38 +3,49 @@ from pynput import keyboard
 import speech_recognition as sr
 from faster_whisper import WhisperModel
 from string import punctuation
-from slang import replacements
+import config
 import re
 import subprocess
 from time import sleep
 from mss import mss
 import numpy as np
 from io import BytesIO
-import sounddevice # turns off alsa error logging
+import sounddevice  # turns off alsa error logging
 
 r = sr.Recognizer()
 
-model = WhisperModel("distil-small.en", device="cuda", compute_type="float16")
+r.pause_threshold = 2
+
+model = WhisperModel(config.model, device="cuda", compute_type="int8_float16")
 
 print("Testing Sound")
 with sr.Microphone() as source:
     r.adjust_for_ambient_noise(source, duration=3)
 print("ready!")
 
+
 def recognize_text() -> str:
 
     with sr.Microphone() as source:
         audio = r.listen(source)
 
-        results, _ = model.transcribe(BytesIO(audio.get_wav_data()), beam_size=5, language="en", condition_on_previous_text=False)
+        results, _ = model.transcribe(
+            BytesIO(audio.get_wav_data()),
+            beam_size=5,
+            language="en",
+            condition_on_previous_text=False,
+        )
 
     return " ".join([segment.text for segment in results])
 
+
 def chat_type():
 
-    screen = mss() #! bad for performance but necessary
+    screen = mss()  #! bad for performance but necessary
 
-    screenshot = np.array(screen.grab({"top": 1090, "left": 1110, "width": 100, "height": 100}))
+    screenshot = np.array(
+        screen.grab({"top": 1090, "left": 1110, "width": 100, "height": 100})
+    )
     try:
         pg.locate("ui/team-chat.png", screenshot, confidence=0.9)
         return "team"
@@ -49,11 +60,12 @@ def chat_type():
 
     return None
 
+
 def on_press(key):
 
     if key is not keyboard.Key.home:
         return
-    
+
     print("Listening...")
     command = recognize_text()
 
@@ -62,25 +74,29 @@ def on_press(key):
     # cleanup command
     command = command.lower().strip()
     for char in punctuation:
-        command = command.replace(char, '')
+        command = command.replace(char, "")
 
-    for original, new in replacements.items():
+    for original, new in config.replacements.items():
         command = command.replace(original, new)
 
     print(f"Cleaned up command: {command}")
 
-    if "chat" in command:
-        message = re.search(r"type (.+?)(and |in |\n|$)", command).groups(0)[0].strip()
-        print(f"Typing: {message} in team chat")
-        
+    if any(keyword in command for keyword in ["type", "say", "write"]):
+        message = (
+            re.search(r"(type|say|write) (.+?)(and |in |\n|$)", command)
+            .groups(0)[1]
+            .strip()
+        )
+        print(f"Typing: {message} in chat")
+
         pg.keyDown("enter")
         sleep(0.041)
         pg.keyUp("enter")
         sleep(0.94)
 
         current_chat = chat_type()
-        if current_chat in command:
-            pass # no change needed
+        if current_chat == None or current_chat in command:
+            pass  # no change needed
         elif "match" in command or "team" in command:
             pg.keyDown("tab")
             sleep(0.041)
@@ -91,7 +107,7 @@ def on_press(key):
         # sleep(0.074)
         # pg.keyUp("enter")
 
-    elif any(keyword in command for keyword in ["maximum", "pulse", "balls", "remove", "eliminate", "murder", "goon", "obliterate", "delete", "piss"]):
+    elif any(keyword in command for keyword in config.maximum_pulse):
         print("MAXIMUM PULSE!!!!")
         pg.keyDown("q")
         sleep(0.032)
@@ -100,10 +116,7 @@ def on_press(key):
     elif "clip" in command:
         subprocess.run("/home/ultrablob/Videos/Clips/save_clip.sh")
 
-    
 
 # Collect events until released
-with keyboard.Listener(
-        on_press=on_press,
-        on_release=lambda event: None) as listener:
-    listener.join()
\ No newline at end of file
+with keyboard.Listener(on_press=on_press, on_release=lambda event: None) as listener:
+    listener.join()
diff --git a/slang.py b/slang.py
deleted file mode 100644
index 906ba55..0000000
--- a/slang.py
+++ /dev/null
@@ -1,6 +0,0 @@
-replacements = {
-    "gigi": "gg",
-    "heels": "heals",
-    "heeling": "healing",
-    "heel": "heal"
-}
\ No newline at end of file