Skip to content

Commit

Permalink
wav
Browse files Browse the repository at this point in the history
  • Loading branch information
Dooy committed Oct 9, 2024
1 parent 411c807 commit c46473d
Show file tree
Hide file tree
Showing 12 changed files with 1,657 additions and 0 deletions.
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
"vue-waterfall-plugin-next": "^2.3.1"
},
"devDependencies": {
"@openai/realtime-api-beta": "github:dooy/openai-realtime-api-beta",
"@openai/realtime-wavtools": "github:dooy/openai-realtime-wavtools",
"@antfu/eslint-config": "^0.35.3",
"@commitlint/cli": "^17.4.4",
"@commitlint/config-conventional": "^17.4.4",
Expand Down
41 changes: 41 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions src/lib/wavtools/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Barrel module: re-exports the wavtools public API from its lib/ internals.
import { WavPacker } from './lib/wav_packer.js';
import { AudioAnalysis } from './lib/analysis/audio_analysis.js';
import { WavStreamPlayer } from './lib/wav_stream_player.js';
import { WavRecorder } from './lib/wav_recorder.js';

export { AudioAnalysis, WavPacker, WavStreamPlayer, WavRecorder };
203 changes: 203 additions & 0 deletions src/lib/wavtools/lib/analysis/audio_analysis.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import {
noteFrequencies,
noteFrequencyLabels,
voiceFrequencies,
voiceFrequencyLabels,
} from './constants.js';

/**
* Output of AudioAnalysis for the frequency domain of the audio
* @typedef {Object} AudioAnalysisOutputType
* @property {Float32Array} values Amplitude of this frequency between {0, 1} inclusive
* @property {number[]} frequencies Raw frequency bucket values
* @property {string[]} labels Labels for the frequency bucket values
*/

/**
* Analyzes audio for visual output
* @class
*/
export class AudioAnalysis {
  /**
   * Retrieves frequency domain data from an AnalyserNode adjusted to a decibel range
   * returns human-readable formatting and labels
   * @param {AnalyserNode} analyser
   * @param {number} sampleRate
   * @param {Float32Array} [fftResult] pre-captured FFT frame; read live from the analyser when omitted
   * @param {"frequency"|"music"|"voice"} [analysisType]
   * @param {number} [minDecibels] default -100
   * @param {number} [maxDecibels] default -30
   * @returns {AudioAnalysisOutputType}
   */
  static getFrequencies(
    analyser,
    sampleRate,
    fftResult,
    analysisType = 'frequency',
    minDecibels = -100,
    maxDecibels = -30,
  ) {
    if (!fftResult) {
      // No cached frame supplied: sample the analyser's current output.
      fftResult = new Float32Array(analyser.frequencyBinCount);
      analyser.getFloatFrequencyData(fftResult);
    }
    // FFT bins span 0..Nyquist (half the sample rate) in equal-width steps.
    const nyquistFrequency = sampleRate / 2;
    const frequencyStep = (1 / fftResult.length) * nyquistFrequency;
    let outputValues;
    let frequencies;
    let labels;
    if (analysisType === 'music' || analysisType === 'voice') {
      const useFrequencies =
        analysisType === 'voice' ? voiceFrequencies : noteFrequencies;
      // One slot per named note; seed with the decibel floor so untouched
      // notes normalize to 0 below.
      const aggregateOutput = Array(useFrequencies.length).fill(minDecibels);
      for (let i = 0; i < fftResult.length; i++) {
        const frequency = i * frequencyStep;
        const amplitude = fftResult[i];
        // Walk the note table from the top down; the first note below this
        // bin's frequency "owns" the bin, keeping the loudest amplitude seen.
        for (let n = useFrequencies.length - 1; n >= 0; n--) {
          if (frequency > useFrequencies[n]) {
            aggregateOutput[n] = Math.max(aggregateOutput[n], amplitude);
            break;
          }
        }
      }
      outputValues = aggregateOutput;
      frequencies =
        analysisType === 'voice' ? voiceFrequencies : noteFrequencies;
      labels =
        analysisType === 'voice' ? voiceFrequencyLabels : noteFrequencyLabels;
    } else {
      // Raw "frequency" mode: report every FFT bin with a "<Hz> Hz" label.
      outputValues = Array.from(fftResult);
      frequencies = outputValues.map((_, i) => frequencyStep * i);
      labels = frequencies.map((f) => `${f.toFixed(2)} Hz`);
    }
    // We normalize to {0, 1}: clamp decibels into [minDecibels, maxDecibels]
    // and rescale linearly.
    const normalizedOutput = outputValues.map((v) => {
      return Math.max(
        0,
        Math.min((v - minDecibels) / (maxDecibels - minDecibels), 1),
      );
    });
    const values = new Float32Array(normalizedOutput);
    return {
      values,
      frequencies,
      labels,
    };
  }

  /**
   * Creates a new AudioAnalysis instance for an HTMLAudioElement
   * @param {HTMLAudioElement} audioElement
   * @param {AudioBuffer|null} [audioBuffer] If provided, will cache all frequency domain data from the buffer
   * @returns {AudioAnalysis}
   */
  constructor(audioElement, audioBuffer = null) {
    // Cached FFT frames, filled only on the offline (audioBuffer) path.
    this.fftResults = [];
    if (audioBuffer) {
      /**
       * Modified from
       * https://stackoverflow.com/questions/75063715/using-the-web-audio-api-to-analyze-a-song-without-playing
       *
       * We do this to populate FFT values for the audio if provided an `audioBuffer`
       * The reason to do this is that Safari fails when using `createMediaElementSource`
       * This has a non-zero RAM cost so we only opt-in to run it on Safari, Chrome is better
       */
      const { length, sampleRate } = audioBuffer;
      const offlineAudioContext = new OfflineAudioContext({
        length,
        sampleRate,
      });
      const source = offlineAudioContext.createBufferSource();
      source.buffer = audioBuffer;
      const analyser = offlineAudioContext.createAnalyser();
      analyser.fftSize = 8192;
      analyser.smoothingTimeConstant = 0.1;
      source.connect(analyser);
      // limit is :: 128 / sampleRate;
      // but we just want 60fps - cuts ~1s from 6MB to 1MB of RAM
      const renderQuantumInSeconds = 1 / 60;
      const durationInSeconds = length / sampleRate;
      // Recursively schedule a suspend point every render quantum, capture one
      // FFT frame at each, then resume until the whole buffer is analyzed.
      // NOTE(review): the startRendering() promise is not awaited, so frames
      // populate asynchronously after construction returns.
      const analyze = (index) => {
        const suspendTime = renderQuantumInSeconds * index;
        if (suspendTime < durationInSeconds) {
          // suspend() must be scheduled before rendering reaches suspendTime.
          offlineAudioContext.suspend(suspendTime).then(() => {
            const fftResult = new Float32Array(analyser.frequencyBinCount);
            analyser.getFloatFrequencyData(fftResult);
            this.fftResults.push(fftResult);
            analyze(index + 1);
          });
        }
        // First call kicks off rendering; every later call resumes past the
        // suspend point that was just serviced.
        if (index === 1) {
          offlineAudioContext.startRendering();
        } else {
          offlineAudioContext.resume();
        }
      };
      source.start(0);
      analyze(1);
      this.audio = audioElement;
      this.context = offlineAudioContext;
      this.analyser = analyser;
      this.sampleRate = sampleRate;
      this.audioBuffer = audioBuffer;
    } else {
      // Live path: tap the audio element through a MediaElementSource and
      // keep it audible by routing the analyser on to the destination.
      const audioContext = new AudioContext();
      const track = audioContext.createMediaElementSource(audioElement);
      const analyser = audioContext.createAnalyser();
      analyser.fftSize = 8192;
      analyser.smoothingTimeConstant = 0.1;
      track.connect(analyser);
      analyser.connect(audioContext.destination);
      this.audio = audioElement;
      this.context = audioContext;
      this.analyser = analyser;
      this.sampleRate = this.context.sampleRate;
      this.audioBuffer = null;
    }
  }

  /**
   * Gets the current frequency domain data from the playing audio track
   * @param {"frequency"|"music"|"voice"} [analysisType]
   * @param {number} [minDecibels] default -100
   * @param {number} [maxDecibels] default -30
   * @returns {AudioAnalysisOutputType}
   */
  getFrequencies(
    analysisType = 'frequency',
    minDecibels = -100,
    maxDecibels = -30,
  ) {
    let fftResult = null;
    if (this.audioBuffer && this.fftResults.length) {
      // Offline path: pick the cached frame nearest the playback position.
      const pct = this.audio.currentTime / this.audio.duration;
      const index = Math.min(
        (pct * this.fftResults.length) | 0, // |0 truncates to an integer index
        this.fftResults.length - 1,
      );
      fftResult = this.fftResults[index];
    }
    return AudioAnalysis.getFrequencies(
      this.analyser,
      this.sampleRate,
      fftResult,
      analysisType,
      minDecibels,
      maxDecibels,
    );
  }

  /**
   * Resume the internal AudioContext if it was suspended due to the lack of
   * user interaction when the AudioAnalysis was instantiated.
   * @returns {Promise<true>}
   */
  async resumeIfSuspended() {
    if (this.context.state === 'suspended') {
      await this.context.resume();
    }
    return true;
  }
}

// Also expose on the global scope so non-module consumers can reach the class.
globalThis.AudioAnalysis = AudioAnalysis;
60 changes: 60 additions & 0 deletions src/lib/wavtools/lib/analysis/constants.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
* Constants for help with visualization
* Helps map frequency ranges from Fast Fourier Transform
* to human-interpretable ranges, notably music ranges and
* human vocal ranges.
*/

// Reference pitches for the eighth octave (C8 through B8), in Hz.
const octave8Hz = [
  4186.01, 4434.92, 4698.63, 4978.03, 5274.04, 5587.65, 5919.91, 6271.93,
  6644.88, 7040.0, 7458.62, 7902.13,
];

// Note names matching octave8Hz index-for-index.
const noteNames = [
  'C',
  'C#',
  'D',
  'D#',
  'E',
  'F',
  'F#',
  'G',
  'G#',
  'A',
  'A#',
  'B',
];

/**
 * All note frequencies from 1st to 8th octave
 * in format "A#8" (A#, 8th octave)
 */
export const noteFrequencies = [];
export const noteFrequencyLabels = [];
for (let octave = 1; octave <= 8; octave++) {
  noteNames.forEach((name, idx) => {
    // Each octave down halves the frequency, so divide the octave-8 pitch.
    noteFrequencies.push(octave8Hz[idx] / 2 ** (8 - octave));
    noteFrequencyLabels.push(`${name}${octave}`);
  });
}

/**
 * Subset of the note frequencies between 32 and 2000 Hz
 * 6 octave range: C1 to B6
 */
const voiceLowHz = 32.0;
const voiceHighHz = 2000.0;
// Shared predicate so frequency and label subsets stay index-aligned.
const inVoiceRange = (i) =>
  noteFrequencies[i] > voiceLowHz && noteFrequencies[i] < voiceHighHz;
export const voiceFrequencies = noteFrequencies.filter((_, i) =>
  inVoiceRange(i),
);
export const voiceFrequencyLabels = noteFrequencyLabels.filter((_, i) =>
  inVoiceRange(i),
);
Loading

0 comments on commit c46473d

Please sign in to comment.