From cf9093c8f5ef955d56540d693eb125e621e22b25 Mon Sep 17 00:00:00 2001
From: Jonas Haag
Date: Fri, 4 Sep 2020 14:17:56 +0200
Subject: [PATCH] Add JSON API.

Also optimise Dockerfile for quick rebuilds.
Also add support for multiple input files to `predict.py`.
---
 Dockerfile       | 12 ++++++++----
 README.md        | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 predict.py       | 57 +++++++++++++++++++++++++++++------------------------
 predict_api.py   | 45 +++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  8 --------
 5 files changed, 139 insertions(+), 37 deletions(-)
 create mode 100644 predict_api.py
 delete mode 100644 requirements.txt

diff --git a/Dockerfile b/Dockerfile
index 7794fc8..a74986c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,16 @@
 FROM conda/miniconda3
 
-RUN apt-get update && \
-    apt-get install -y libsndfile1
+RUN apt-get update && apt-get install -y g++
+
+# Install the dependencies first so that changes to the application code
+# do not require a rebuild of the entire image
+RUN conda update conda && \
+    conda install "keras<2.4" "numpy<2" "scikit-learn<0.23" && \
+    conda install -c conda-forge librosa theano
 
 ADD . /app
 WORKDIR /app
 
 VOLUME /data
 
-RUN pip install --upgrade pip && \
-    pip install -r requirements.txt
+ENV KERAS_BACKEND=theano

diff --git a/README.md b/README.md
index 3a750ea..31bbb08 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,8 @@ This repository provides the [keras](https://keras.io/) model to be used from Py
 [Docker](https://www.docker.com/) makes it easy to reproduce the results and install all requirements. If you have docker installed, run the following steps to predict a count from the provided test sample.
 
 * Build the docker image: `docker build -t countnet .`
-* Predict from example: `docker run -i countnet python predict.py --model CRNN examples/5_speakers.wav`
+* Run like this: `docker run -it countnet python predict.py ...` (see usage details below)
+* Mount your data into the container: `docker run -v /path/to/your/data:/data -it countnet python predict.py ... /data/your_audio.wav`
 
 ### Manual Installation
 
@@ -49,7 +50,56 @@ To install the requirements using Anaconda Python, run
 You can now run the command line script and process wav files using the pre-trained model `CRNN` (best performance).
 
-`python predict.py examples/5_speakers.wav --model CRNN`
+```
+python predict.py --model CRNN examples/5_speakers.wav
+# => Speaker Count Estimate: examples/5_speakers.wav 5
+```
+
+You can also pass multiple files at once.
+
+```
+python predict.py --model CRNN examples/5_speakers.wav examples/5_speakers.wav
+# => Speaker Count Estimate: examples/5_speakers.wav 5
+# => Speaker Count Estimate: examples/5_speakers.wav 5
+```
+
+There is also a simple JSON API for sending audio data to the model (not production ready; for development only!). To run the server:
+
+```
+python predict_api.py --model CRNN
+
+# With Docker:
+docker run -p5000:5000 -it countnet python predict_api.py --model CRNN
+```
+
+The server expects a JSON list of base64-encoded 16 kHz float32 audio arrays and returns a JSON list of integers. If estimation fails for any of the arrays, its result is set to `null` instead.
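+
+For a quick test from the shell, you can POST a prepared JSON file (here a hypothetical `payload.json` containing the list of base64 strings) with `curl`:
+
+```
+curl -X POST -H "Content-Type: application/json" -d @payload.json http://localhost:5000
+# => [5]
+```
+
+From Python, a client might look like this: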
+
+```py
+import base64
+import requests
+import librosa
+
+audio_data1 = librosa.core.load("/path/to/5_speakers.wav", sr=16000, dtype="float32")[0]
+response = requests.post(
+    "http://localhost:5000",
+    json=[
+        # b64encode() returns bytes; decode so the payload is JSON-serialisable
+        base64.b64encode(audio_data1.tobytes()).decode("ascii")
+    ]
+)
+print(response.json())
+# => [5]
+```
+
 
 ## Reproduce Paper Results using the LibriCount Dataset
 
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1216072.svg)](https://doi.org/10.5281/zenodo.1216072)

diff --git a/predict.py b/predict.py
index c04e811..ad853e6 100644
--- a/predict.py
+++ b/predict.py
@@ -1,5 +1,4 @@
 import numpy as np
-import soundfile as sf
 import argparse
 import os
 import keras
@@ -20,6 +19,25 @@ def class_mae(y_true, y_pred):
     )
 
 
+def load_scaler():
+    scaler = sklearn.preprocessing.StandardScaler()
+    with np.load(os.path.join("models", 'scaler.npz')) as data:
+        scaler.mean_ = data['arr_0']
+        scaler.scale_ = data['arr_1']
+    return scaler
+
+
+def load_model(model_name):
+    path = os.path.join('models', model_name + '.h5')
+    return keras.models.load_model(
+        path,
+        custom_objects={
+            'class_mae': class_mae,
+            'exp': K.exp
+        }
+    )
+
+
 def count(audio, model, scaler):
     # compute STFT
     X = np.abs(librosa.stft(audio, n_fft=400, hop_length=160)).T
@@ -51,7 +69,8 @@
 
     parser.add_argument(
         'audio',
-        help='audio file (samplerate 16 kHz) of 5 seconds duration'
+        help='audio file(s) (samplerate 16 kHz) of 5 seconds duration',
+        nargs='+',
     )
 
     parser.add_argument(
@@ -59,30 +78,22 @@
         help='model name'
     )
 
+    parser.add_argument('--print-summary', action='store_true')
+
     args = parser.parse_args()
 
     # load model
-    model = keras.models.load_model(
-        os.path.join('models', args.model + '.h5'),
-        custom_objects={
-            'class_mae': class_mae,
-            'exp': K.exp
-        }
-    )
+    model = load_model(args.model)
 
-    # print model configuration
-    model.summary()
-    # save as svg file
-    # load standardisation parameters
-    scaler = sklearn.preprocessing.StandardScaler()
-    with np.load(os.path.join("models", 'scaler.npz')) as data:
-        scaler.mean_ = data['arr_0']
-        scaler.scale_ = data['arr_1']
+    if args.print_summary:
+        # print model configuration
+        model.summary()
 
-    # compute audio
-    audio, rate = sf.read(args.audio, always_2d=True)
+    # load standardisation parameters
+    scaler = load_scaler()
 
-    # downmix to mono
-    audio = np.mean(audio, axis=1)
-    estimate = count(audio, model, scaler)
-    print("Speaker Count Estimate: ", estimate)
+    for f in args.audio:
+        # compute audio
+        audio = librosa.load(f, sr=16000)[0]
+        estimate = count(audio, model, scaler)
+        print("Speaker Count Estimate:", f, estimate)

diff --git a/predict_api.py b/predict_api.py
new file mode 100644
index 0000000..f7cd9e6
--- /dev/null
+++ b/predict_api.py
@@ -0,0 +1,45 @@
+import base64
+import json
+import numpy as np
+from werkzeug.wrappers import Request, Response
+import predict
+
+
+def decode_audio(audio_bytes):
+    return np.frombuffer(base64.b64decode(audio_bytes), dtype="float32")
+
+
+def make_app(estimate_func):
+    def app(environ, start_response):
+        inputs = json.loads(Request(environ).get_data())
+
+        outputs = []
+        for inp in inputs:
+            try:
+                est = int(estimate_func(decode_audio(inp)))
+            except Exception as e:
+                print(f"Error estimating speaker count for input {len(outputs)}: {e}")
+                est = None
+            outputs.append(est)
+
+        return Response(json.dumps(outputs), mimetype="application/json")(environ, start_response)
+
+    return app
+
+
+if __name__ == "__main__":
+    import argparse
+    import functools
+    from werkzeug.serving import run_simple
+
+    parser = argparse.ArgumentParser(
+        description="Run a simple JSON API server to predict speaker counts"
+    )
+    parser.add_argument("--model", default="CRNN", help="model name")
+    args = parser.parse_args()
+
+    model = predict.load_model(args.model)
+    scaler = predict.load_scaler()
+
+    app = make_app(functools.partial(predict.count, model=model, scaler=scaler))
+    run_simple("0.0.0.0", 5000, app, use_debugger=True)

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d51daf5..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-audioread
-h5py
-keras<2.4
-librosa
-numpy<2
-scikit-learn<0.23
-sklearn==0.0
-tensorflow<2