diff --git a/Dockerfile b/Dockerfile
index 9526de9..5cf47fa 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,14 +3,14 @@ FROM python:3.11-slim
 RUN --mount=type=cache,target=/root/.cache/pip pip install -U pip
 
 ARG TARGETPLATFORM
-RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg libaio-dev
+RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
 RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
 RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
 ENV PATH="/root/.cargo/bin:${PATH}"
 
 # for deepspeed support - doesn't seem worth it, image +7.5GB, over the 10GB ghcr.io limit, and no noticable gain in speed or VRAM usage?
 #RUN curl -O https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.1-1_all.deb
 #RUN dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb
-#RUN apt-get update && apt-get install --no-install-recommends -y build-essential cuda-toolkit
+#RUN apt-get update && apt-get install --no-install-recommends -y libaio-dev build-essential cuda-toolkit
 #ENV CUDA_HOME=/usr/local/cuda
 RUN apt-get clean && rm -rf /var/lib/apt/lists/*
diff --git a/README.md b/README.md
index b7f5f0d..f26608a 100644
--- a/README.md
+++ b/README.md
@@ -23,15 +23,17 @@ Details:
 * Configurable [generation parameters](#generation-parameters)
 * Streamed output while generating
 * Occasionally, certain words or symbols may sound incorrect, you can fix them with regex via `pre_process_map.yaml`
+* Tested with python 3.9-3.11; piper does not install on python 3.12 yet
 
 If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
 
 ## Recent Changes
 
-Version 0.15.2, 2024-06-28
+Version 0.16.0, 2024-06-29
+
+* Multi-client safe version. Audio generation is synchronized in a single process. The estimated 'realtime' factor of XTTS on a GPU is roughly 1/3, which means that multiple simultaneous streams, or `speed` over 2, may experience audio underrun (delays or pauses in playback). This makes multiple clients possible and safe, but in practice 2 or 3 simultaneous streams are the maximum before audio underrun.
 
-* Thread safe version, with audio generation synchronized at the sentence level. The estimated 'realtime' factor of XTTS of roughly 1/3, this means that multiple streams simultaneously, or `speed` over 2, may experience audio underrun (ie. delays in playback)
 
 Version 0.15.1, 2024-06-27
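For context on the multi-client note above: the change can be exercised from the client side by opening a few streams at once against the server. A minimal sketch, assuming the server is listening on localhost:8000 and serves an OpenAI-compatible `/v1/audio/speech` route; the endpoint path, payload fields, and voice name here are illustrative assumptions, not taken from this diff:

```python
# Hedged sketch: run a few concurrent streaming requests and time them to see
# where audio underrun would start. Endpoint path, voice, and payload are assumed.
import concurrent.futures
import time
import requests

URL = "http://localhost:8000/v1/audio/speech"   # default port per speech.py args

def stream_once(i: int) -> float:
    payload = {
        "model": "tts-1-hd",    # the xtts path in this server
        "voice": "alloy",       # assumed voice name
        "input": "This is a short multi-client streaming test.",
    }
    start = time.time()
    with requests.post(URL, json=payload, stream=True, timeout=300) as r:
        r.raise_for_status()
        for _ in r.iter_content(chunk_size=4096):
            pass                # just drain the audio stream
    return time.time() - start

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as pool:
    for i, took in enumerate(pool.map(stream_once, range(3))):
        print(f"stream {i}: {took:.1f}s")
```

With a realtime factor of roughly 1/3, each stream needs about a third of its playback time to generate, so two or three simultaneous streams saturate a single synchronized process, which matches the 2-3 stream limit quoted above.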
diff --git a/speech.py b/speech.py
index 36340d0..6cbeeb5 100755
--- a/speech.py
+++ b/speech.py
@@ -1,9 +1,7 @@
 #!/usr/bin/env python3
 import argparse
-import asyncio
 import contextlib
 import gc
-import io
 import os
 import queue
 import re
@@ -73,12 +71,8 @@ def __init__(self, model_name, device, model_path=None, unload_timer=None):
         if self.unload_timer:
             logger.info(f"Setting unload timer to {self.unload_timer} seconds")
-            self.not_idle()
-            self.check_idle()
-
-    def not_idle(self):
-        with self.lock:
             self.last_used = time.time()
+            self.check_idle()
 
     def check_idle(self):
         with self.lock:
@@ -92,22 +86,28 @@ def check_idle(self):
             self.timer.start()
 
     def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
-        logger.debug(f"waiting lock")
-        with self.lock, torch.no_grad(): # I wish this could be another way, but it seems that inference_stream cannot be access async reliably
-            logger.debug(f"grabbed lock, tts text: {text}")
+        with torch.no_grad():
             self.last_used = time.time()
+            tokens = 0
             try:
-                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # XXX TODO: allow multiple wav
-
-                for wav in self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs):
-                    yield wav.cpu().numpy().tobytes()
+                with self.lock:
+                    gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # not worth caching calls, it's < 0.001s after model is loaded
+                    pcm_stream = self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs)
+                    self.last_used = time.time()
+
+                while True:
+                    with self.lock:
+                        yield next(pcm_stream).cpu().numpy().tobytes()
+                        self.last_used = time.time()
+                        tokens += 1
+
+            except StopIteration:
+                pass
 
             finally:
-                logger.debug(f"held lock for {time.time() - self.last_used:0.1f} sec")
+                logger.debug(f"Generated {tokens} tokens in {time.time() - self.last_used:.2f}s @ {tokens / (time.time() - self.last_used):.2f} T/s")
                 self.last_used = time.time()
-
-
 def default_exists(filename: str):
     if not os.path.exists(filename):
         fpath, ext = os.path.splitext(filename)
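The tts() rewrite above is the core of the multi-client change: instead of one caller holding self.lock for an entire utterance, the lock is taken once to compute the conditioning latents and start inference_stream, and then re-taken around each chunk pulled from the stream, so several requests can interleave. A stripped-down sketch of that locking pattern (illustrative only, with a plain generator standing in for XTTS, and the yield moved outside the lock for simplicity):

```python
# Sketch of per-chunk locking: two client streams share one lock, but each
# acquisition only covers producing a single chunk, so the streams interleave.
import threading

lock = threading.Lock()

def fake_inference_stream(name, n):
    # stands in for xtts.inference_stream(...)
    for i in range(n):
        yield f"{name} chunk {i}"

def stream(name, n):
    with lock:                       # setup step, like get_conditioning_latents()
        chunks = fake_inference_stream(name, n)
    while True:
        with lock:                   # one chunk per lock acquisition
            try:
                chunk = next(chunks)
            except StopIteration:
                break
        yield chunk

def client(name):
    for chunk in stream(name, 3):
        print(chunk)

threads = [threading.Thread(target=client, args=(f"client {i}",)) for i in range(2)]
for t in threads: t.start()
for t in threads: t.join()
```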
@@ -203,13 +203,12 @@ async def generate_speech(request: GenerateSpeechRequest):
     elif response_format == "pcm":
         if model == 'tts-1': # piper
             media_type = "audio/pcm;rate=22050"
-        elif model == 'tts-1-hd':
+        elif model == 'tts-1-hd': # xtts
            media_type = "audio/pcm;rate=24000"
     else:
         raise BadRequestError(f"Invalid response_format: '{response_format}'", param='response_format')
 
     ffmpeg_args = None
-    tts_io_out = None
 
     # Use piper for tts-1, and if xtts_device == none use for all models.
     if model == 'tts-1' or args.xtts_device == 'none':
@@ -289,77 +288,9 @@ async def generate_speech(request: GenerateSpeechRequest):
 
         ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 
-        # before the xtts lock, it was:
-        #def generator():
-        #    for chunk in xtts.tts(text=input_text, language=language, speaker_wav=speaker, **hf_generate_kwargs):
-        #        ffmpeg_proc.stdin.write(chunk) # <-- but this blocks forever and holds the xtts lock if a client disconnects
-        #worker = threading.Thread(target=generator)
-        #worker.daemon = True
-        #worker.start()
-        #return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)
-        #
-        # What follows is stupidly overcomplicated, but there is no other way I can find (yet) that detects client disconnects and not get blocked up
-
-        os.set_blocking(ffmpeg_proc.stdout.fileno(), False) # this doesn't work on windows until python 3.12
-        os.set_blocking(ffmpeg_proc.stdin.fileno(), False) # this doesn't work on windows until python 3.12
-        ffmpeg_in = io.FileIO(ffmpeg_proc.stdin.fileno(), 'wb')
-
         in_q = queue.Queue() # speech pcm
-        out_q = queue.Queue() # ffmpeg audio out
         ex_q = queue.Queue() # exceptions
 
-        def ffmpeg_io():
-            # in_q -> ffmopeg -> out_q
-            while not (ffmpeg_proc.stdout.closed and ffmpeg_proc.stdin.closed):
-                try:
-                    while not ffmpeg_proc.stdout.closed:
-                        chunk = ffmpeg_proc.stdout.read()
-                        if chunk is None:
-                            break
-
-                        if len(chunk) == 0: # real end
-                            out_q.put(None)
-                            ffmpeg_proc.stdout.close()
-                            break
-
-                        out_q.put(chunk)
-                        continue # consume audio without delay
-
-                except Exception as e:
-                    logger.debug(f"ffmpeg stdout read: {repr(e)}")
-                    out_q.put(None)
-                    ex_q.put(e)
-                    return
-
-                try:
-                    while not ffmpeg_proc.stdin.closed:
-                        chunk = in_q.get_nowait()
-                        if chunk is None:
-                            ffmpeg_proc.stdin.close()
-                            break
-                        n = ffmpeg_in.write(chunk) # BrokenPipeError from here on client disconnect
-                        if n is None:
-                            in_q.queue.appendleft(chunk)
-                            break
-                        if n != len(chunk):
-                            in_q.queue.appendleft(chunk[n:])
-                            break
-
-                except queue.Empty:
-                    pass
-
-                except BrokenPipeError as e:
-                    ex_q.put(e) # we need to get this exception into the generation loop, which holds the lock
-                    ffmpeg_proc.kill()
-                    return
-
-                except Exception as e:
-                    ex_q.put(e)
-                    ffmpeg_proc.kill()
-                    return
-
-                time.sleep(0.01)
-
         def exception_check(exq: queue.Queue):
             try:
                 e = exq.get_nowait()
@@ -376,56 +307,45 @@ def generator():
                     exception_check(ex_q)
                     in_q.put(chunk)
 
-                in_q.put(None)
-
             except BrokenPipeError as e: # client disconnect lands here
-                #logger.debug(f"{repr(e)}")
-                logger.info("Client disconnected")
-
-            except asyncio.CancelledError as e:
-                logger.debug(f"{repr(e)}")
-                pass
+                logger.info("Client disconnected - 'Broken pipe'")
 
             except Exception as e:
                 logger.error(f"Exception: {repr(e)}")
                 raise e
+
+            finally:
+                in_q.put(None) # sentinel
 
-        worker = threading.Thread(target=generator, daemon = True)
-        worker.start()
-
-        worker2 = threading.Thread(target=ffmpeg_io, daemon = True)
-        worker2.start()
-
-        async def audio_out():
-            # out_q -> client
-            while True:
-                try:
-                    audio = out_q.get_nowait()
-                    if audio is None:
-                        return
-                    yield audio
-
-                except queue.Empty:
-                    pass
-
-                except asyncio.CancelledError as e:
-                    logger.debug("{repr(e)}")
-                    ex_q.put(e)
-                    return
-
-                except Exception as e:
-                    logger.debug("{repr(e)}")
-                    ex_q.put(e)
-                    return
+        def out_writer():
+            # in_q -> ffmpeg
+            try:
+                while True:
+                    chunk = in_q.get()
+                    if chunk is None: # sentinel
+                        break
+                    ffmpeg_proc.stdin.write(chunk) # BrokenPipeError from here on client disconnect
+
+            except Exception as e: # BrokenPipeError
+                ex_q.put(e) # we need to get this exception into the generation loop
+                ffmpeg_proc.kill()
+                return
 
-                await asyncio.sleep(0.01)
+            finally:
+                ffmpeg_proc.stdin.close()
+
+        generator_worker = threading.Thread(target=generator, daemon=True)
+        generator_worker.start()
+
+        out_writer_worker = threading.Thread(target=out_writer, daemon=True)
+        out_writer_worker.start()
 
         def cleanup():
             ffmpeg_proc.kill()
-            del worker
-            del worker2
+            del generator_worker
+            del out_writer_worker
 
-        return StreamingResponse(audio_out(), media_type=media_type, background=cleanup)
+        return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type, background=cleanup)
 
     else:
         raise BadRequestError("No such model, must be tts-1 or tts-1-hd.", param='model')
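The net effect of the hunk above: the non-blocking-I/O polling loop is gone, replaced by two plain daemon threads and a direct stream of ffmpeg's stdout to the client. The generator thread feeds PCM into in_q, out_writer drains in_q into ffmpeg's stdin (a None sentinel ends the stream), and any write error, typically BrokenPipeError after ffmpeg is killed on client disconnect, travels back through ex_q so the generation loop stops instead of blocking while it still holds the XTTS lock. The same shape, reduced to a runnable sketch with `cat` standing in for ffmpeg (a simplification, not the project's code):

```python
# Producer -> in_q -> writer thread -> subprocess stdin, with a None sentinel
# and an exception queue as the back-channel; 'cat' stands in for ffmpeg.
import queue
import subprocess
import threading

proc = subprocess.Popen(["cat"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
in_q = queue.Queue()   # "pcm" chunks
ex_q = queue.Queue()   # exceptions raised in the writer thread

def exception_check(exq: queue.Queue):
    try:
        raise exq.get_nowait()      # re-raise a writer-side error in the producer
    except queue.Empty:
        pass

def producer():
    try:
        for i in range(5):          # stands in for chunks from xtts.tts(...)
            exception_check(ex_q)
            in_q.put(f"pcm chunk {i}\n".encode())
    finally:
        in_q.put(None)              # sentinel: no more audio

def writer():
    try:
        while (chunk := in_q.get()) is not None:
            proc.stdin.write(chunk) # BrokenPipeError here if the process dies
    except Exception as e:
        ex_q.put(e)
        proc.kill()
    finally:
        proc.stdin.close()

threading.Thread(target=producer, daemon=True).start()
threading.Thread(target=writer, daemon=True).start()

print(proc.stdout.read().decode())  # the HTTP response would stream this instead
```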
@@ -448,6 +368,7 @@ def auto_torch_device():
     parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
     parser.add_argument('--unload-timer', action='store', default=None, type=int, help="Idle unload timer for the XTTS model in seconds, Ex. 900 for 15 minutes")
     parser.add_argument('--use-deepspeed', action='store_true', default=False, help="Use deepspeed with xtts (this option is unsupported)")
+    parser.add_argument('--no-cache-speaker', action='store_true', default=False, help="Don't use the speaker wav embeddings cache")
     parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
     parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
     parser.add_argument('-L', '--log-level', default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Set the log level")
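One of the context lines above is --unload-timer, which works with the last_used/check_idle bookkeeping simplified earlier in this diff: every generated chunk refreshes last_used, and a recurring threading.Timer unloads the model once it has sat idle past the timeout. The pattern, reduced to a self-contained sketch (no XTTS involved; unloading is simulated with a flag, where the real code would free the model and VRAM):

```python
# Sketch of the idle-unload pattern behind --unload-timer: a recurring
# threading.Timer checks how long the resource has been idle and releases
# it once the timeout has passed.
import threading
import time

class IdleUnloader:
    def __init__(self, unload_timer: float):
        self.unload_timer = unload_timer
        self.lock = threading.Lock()
        self.loaded = True            # stands in for the XTTS model being resident
        self.last_used = time.time()
        self.check_idle()

    def touch(self):
        # called on every generated chunk in the real code
        with self.lock:
            self.last_used = time.time()

    def check_idle(self):
        with self.lock:
            if self.loaded and time.time() - self.last_used >= self.unload_timer:
                self.loaded = False   # the real code would del the model / free VRAM here
                print("idle: unloaded")
            else:
                self.timer = threading.Timer(self.unload_timer, self.check_idle)
                self.timer.daemon = True
                self.timer.start()

u = IdleUnloader(unload_timer=2.0)
u.touch()
time.sleep(5)                         # after ~2s of idleness the timer unloads
print("loaded:", u.loaded)
```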