ui.html

<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="stylesheet" href="./index.css" />
    <style>
      * {
        box-sizing: border-box;
        padding: 0;
        margin: 0;
        font-family: sans-serif;
      }

      html,
      body {
        height: 100%;
      }

      body {
        padding: 16px 32px;
      }

      body,
      #container,
      #upload-button {
        display: flex;
        flex-direction: column;
        justify-content: center;
        align-items: center;
      }

      h1 {
        text-align: center;
      }

      /* Square drop zone that also displays the chosen image as its background. */
      #container {
        position: relative;
        width: 480px;
        height: 480px;
        max-width: 100%;
        max-height: 100%;
        border: 2px dashed #d1d5db;
        border-radius: 0.75rem;
        overflow: hidden;
        cursor: pointer;
        margin-top: 1rem;
        /* Letterbox the image inside the box, keeping its aspect ratio.
           (An earlier `background-size: 100% 100%` declaration was dead code:
           it was always overridden by this one.) */
        background-size: contain;
        background-position: center;
        background-repeat: no-repeat;
      }

      #mask-output {
        position: absolute;
        pointer-events: none;
        display: none;
      }

      #mask-output-visual {
        position: absolute;
        pointer-events: none;
        opacity: 0.6;
      }

      #upload-button {
        gap: 0.4rem;
        font-size: 18px;
        cursor: pointer;
      }

      #upload {
        display: none;
      }

      svg {
        pointer-events: none;
      }

      #example {
        font-size: 14px;
        text-decoration: underline;
        cursor: pointer;
      }

      #example:hover {
        color: #2563eb;
      }

      #status {
        min-height: 16px;
        margin: 8px 0;
      }

      .icon {
        height: 16px;
        width: 16px;
        position: absolute;
        transform: translate(-50%, -50%);
      }

      #controls > button {
        padding: 6px 12px;
        background-color: #3498db;
        color: white;
        border: 1px solid #2980b9;
        border-radius: 5px;
        cursor: pointer;
        font-size: 16px;
      }

      #controls > button:disabled {
        background-color: #d1d5db;
        color: #6b7280;
        border: 1px solid #9ca3af;
        cursor: not-allowed;
      }

      #information {
        margin-top: 0.25rem;
        font-size: 15px;
      }
    </style>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Transformers.js - Segment Anything</title>
  </head>

  <body>
    <h2>Segment Anything w/ 🤗 Transformers.js</h2>
    <div id="container">
      <label id="upload-button" for="upload">
        <!-- Decorative icon: hidden from assistive technology -->
        <svg
          width="25"
          height="25"
          viewBox="0 0 25 25"
          fill="none"
          xmlns="http://www.w3.org/2000/svg"
          aria-hidden="true"
        >
          <path
            fill="#000"
            d="M3.5 24.3a3 3 0 0 1-1.9-.8c-.5-.5-.8-1.2-.8-1.9V2.9c0-.7.3-1.3.8-1.9.6-.5 1.2-.7 2-.7h18.6c.7 0 1.3.2 1.9.7.5.6.7 1.2.7 2v18.6c0 .7-.2 1.4-.7 1.9a3 3 0 0 1-2 .8H3.6Zm0-2.7h18.7V2.9H3.5v18.7Zm2.7-2.7h13.3c.3 0 .5 0 .6-.3v-.7l-3.7-5a.6.6 0 0 0-.6-.2c-.2 0-.4 0-.5.3l-3.5 4.6-2.4-3.3a.6.6 0 0 0-.6-.3c-.2 0-.4.1-.5.3l-2.7 3.6c-.1.2-.2.4 0 .7.1.2.3.3.6.3Z"
          ></path>
        </svg>
        Click to upload image
        <!-- A span rather than a nested label: a label must not contain another
             label. The script attaches the click handler through the id. -->
        <span id="example">(or try example)</span>
      </label>
      <!-- this is the canvas for actual image data -->
      <canvas id="mask-output"></canvas>
      <!-- this is the canvas for showing the mask to user -->
      <canvas id="mask-output-visual"></canvas>
    </div>
    <!-- Live region: the script writes progress and score text into it -->
    <p id="status" role="status"></p>
    <div id="controls">
      <button id="reset-image" type="button">Reset image</button>
      <button id="clear-points" type="button">Clear points</button>
      <button id="cut-mask" type="button" disabled>Cut mask</button>
    </div>
    <p id="information">
      Left click = positive points, right click = negative points.
    </p>
    <input id="upload" type="file" accept="image/*" />

    <!-- <script src="index.js" type="module"> -->
    <script type="module" defer>
      /**
       * Body of the inference web worker.
       *
       * This function is turned into source text and executed inside a Worker,
       * so it must stay self-contained: it may only use worker globals (`self`)
       * and what it imports itself, never variables of the enclosing script.
       *
       * Message protocol (as implemented below):
       *   in : { type: "segment", data }  - image to encode (passed to RawImage.read)
       *        { type: "decode", data }   - array of { point: [x, y], label }
       *        { type: "reset" }          - drop the current image state
       *   out: { type: "ready" }                           - once, after the model loaded
       *        { type: "segment_result", data: "start" | "done" }
       *        { type: "decode_result", data: { mask, scores } }
       */
      async function workerFunction() {
        // Start downloading the library right away, but do NOT await it before
        // installing the message handler. The worker starts delivering messages
        // as soon as this function first yields; awaiting the import up here
        // would leave a window with no handler in which requests are lost.
        const libraryPromise = import(
          "https://cdn.jsdelivr.net/npm/@xenova/transformers@2.14.0"
        );

        // We adopt the singleton pattern to enable lazy-loading of the model and processor.
        class SegmentAnythingSingleton {
          static model_id = "Xenova/slimsam-77-uniform";
          static model;
          static processor;
          static quantized = true;

          /**
           * Resolve to `[model, processor]`, starting their download on first use.
           * The pending promises are cached, so concurrent callers share one load.
           */
          static async getInstance() {
            const { env, SamModel, AutoProcessor } = await libraryPromise;
            // Since we will download the model from the Hugging Face Hub, we can skip the local model check
            env.allowLocalModels = false;

            if (!this.model) {
              this.model = SamModel.from_pretrained(this.model_id, {
                quantized: this.quantized,
              });
            }
            if (!this.processor) {
              this.processor = AutoProcessor.from_pretrained(this.model_id);
            }

            return Promise.all([this.model, this.processor]);
          }
        }

        // State variables
        let image_embeddings = null;
        let image_inputs = null;
        let ready = false;
        // Incremented by every "segment" and "reset" request, so that an encoding
        // job can detect that it was superseded while it was awaiting.
        let request_id = 0;

        self.onmessage = async (e) => {
          const { RawImage, Tensor } = await libraryPromise;
          const [model, processor] =
            await SegmentAnythingSingleton.getInstance();

          if (!ready) {
            // Indicate that we are ready to accept requests
            ready = true;
            self.postMessage({
              type: "ready",
            });
          }

          const { type, data } = e.data;
          if (type === "reset") {
            request_id++;
            image_inputs = null;
            image_embeddings = null;
          } else if (type === "segment") {
            const request = ++request_id;

            // Indicate that we are starting to segment the image
            self.postMessage({
              type: "segment_result",
              data: "start",
            });

            // Read the image and recompute image embeddings
            const image = await RawImage.read(data);
            const inputs = await processor(image);
            const embeddings = await model.get_image_embeddings(inputs);

            if (request !== request_id) {
              // A reset or a newer image arrived while we were encoding:
              // discard this result instead of resurrecting stale state.
              return;
            }

            // Publish both values together so they always describe the same image
            image_inputs = inputs;
            image_embeddings = embeddings;

            // Indicate that we have computed the image embeddings, and we are ready to accept decoding requests
            self.postMessage({
              type: "segment_result",
              data: "done",
            });
          } else if (type === "decode") {
            // Snapshot the shared state: it can be replaced or cleared by another
            // message while this handler is suspended at an `await`.
            const inputs = image_inputs;
            const embeddings = image_embeddings;
            if (inputs === null || embeddings === null) {
              return; // No encoded image to decode against
            }

            // Prepare inputs for decoding: scale the normalised [0, 1] points
            // to the processor's reshaped input size ([height, width]).
            const reshaped = inputs.reshaped_input_sizes[0];
            const points = data.map((x) => [
              x.point[0] * reshaped[1],
              x.point[1] * reshaped[0],
            ]);
            const labels = data.map((x) => BigInt(x.label));
            const input_points = new Tensor("float32", points.flat(Infinity), [
              1,
              1,
              points.length,
              2,
            ]);
            const input_labels = new Tensor("int64", labels.flat(Infinity), [
              1,
              1,
              labels.length,
            ]);
            // Generate the mask
            const outputs = await model({
              ...embeddings,
              input_points,
              input_labels,
            });
            // Post-process the mask
            const masks = await processor.post_process_masks(
              outputs.pred_masks,
              inputs.original_sizes,
              inputs.reshaped_input_sizes
            );
            // Send the result back to the main thread
            self.postMessage({
              type: "decode_result",
              data: {
                mask: RawImage.fromTensor(masks[0][0]),
                scores: outputs.iou_scores.data,
              },
            });
          } else {
            throw new Error("Unknown message type: " + type);
          }
        };
      }

      // Reference the elements we will use
      const statusLabel = document.getElementById("status");
      const fileUpload = document.getElementById("upload");
      const imageContainer = document.getElementById("container");
      const example = document.getElementById("example");
      const actualMaskCanvas = document.getElementById("mask-output");
      const visualMaskCanvas = document.getElementById("mask-output-visual");
      const uploadButton = document.getElementById("upload-button");
      const resetButton = document.getElementById("reset-image");
      const clearButton = document.getElementById("clear-points");
      const cutButton = document.getElementById("cut-mask");

      // State variables
      let lastPoints = null; // prompt points sent with the next decode request
      let isEncoded = false; // true once the worker reported the embedding as done
      let isDecoding = false; // true while a decode request is in flight
      let isMultiMaskMode = false; // true after the first click (hover preview is off)
      let modelReady = false; // true once the worker reported "ready"
      let imageDataURI = null; // source (data URI or URL) of the current image

      // Constants
      const BASE_URL =
        "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/";
      const EXAMPLE_URL = BASE_URL + "corgi.jpg";

      // Create a web worker so that the main (UI) thread is not blocked during inference.
      // The worker is built from the source text of `workerFunction`, wrapped in an
      // immediately-invoked call, and served from a blob URL.
      const dataObj = "(" + workerFunction + ")();";
      // The MIME type belongs to the Blob itself: `createObjectURL` accepts a
      // single argument and silently ignores anything passed after it.
      const blob = new Blob([dataObj.replace('"use strict";', "")], {
        type: "application/javascript; charset=utf-8",
      });
      const blobURL = (window.URL ? URL : webkitURL).createObjectURL(blob);
      const worker = new Worker(blobURL);

      // Preload star and cross images to avoid lag on first click
      const star = new Image();
      star.src = BASE_URL + "star-icon.png";
      star.className = "icon";

      const cross = new Image();
      cross.src = BASE_URL + "cross-icon.png";
      cross.className = "icon";

      // Set up message handler for the worker's replies:
      //   "ready"          - the model finished loading
      //   "segment_result" - image encoding started ("start") or finished (anything else)
      //   "decode_result"  - a mask and its scores, which are painted onto the canvases
      worker.addEventListener("message", (e) => {
        const { type, data } = e.data;
        if (type === "ready") {
          modelReady = true;
          statusLabel.textContent = "Ready";
        } else if (type === "decode_result") {
          isDecoding = false;

          if (!isEncoded) {
            return; // We are not ready to decode yet
          }

          if (!isMultiMaskMode && lastPoints) {
            // Perform decoding with the last point
            decode();
            lastPoints = null;
          }

          const { mask, scores } = data;

          // Update canvas dimensions (if different)
          if (
            actualMaskCanvas.width !== mask.width ||
            actualMaskCanvas.height !== mask.height
          ) {
            actualMaskCanvas.width = mask.width;
            actualMaskCanvas.height = mask.height;
          }

          // Create context and allocate buffer for pixel data
          const context = actualMaskCanvas.getContext("2d");
          const imageData = context.createImageData(
            actualMaskCanvas.width,
            actualMaskCanvas.height
          );

          // Select best mask
          const numMasks = scores.length; // 3
          let bestIndex = 0;
          for (let i = 1; i < numMasks; ++i) {
            if (scores[i] > scores[bestIndex]) {
              bestIndex = i;
            }
          }
          statusLabel.textContent = `Segment score: ${scores[bestIndex].toFixed(
            2
          )}`;

          // Fill mask with colour. `pixelData` holds 4 bytes (RGBA) per pixel,
          // so the number of pixels to visit is a quarter of its length.
          const pixelData = imageData.data;
          const numPixels = pixelData.length / 4;
          for (let i = 0; i < numPixels; ++i) {
            if (mask.data[numMasks * i + bestIndex] === 1) {
              const offset = 4 * i;
              pixelData[offset] = 0; // red
              pixelData[offset + 1] = 114; // green
              pixelData[offset + 2] = 189; // blue
              pixelData[offset + 3] = 255; // alpha
            }
          }

          // Draw image data to context
          context.putImageData(imageData, 0, 0);

          // Put actualMaskCanvas onto visualMaskCanvas, scaled to the largest
          // size that fits the container while keeping the mask's aspect ratio.
          // This is done for every result, whatever the image size: assigning
          // width/height also clears the canvas, so masks never accumulate.
          const boxWidth = imageContainer.clientWidth || 480;
          const boxHeight = imageContainer.clientHeight || 480;
          const scale = Math.min(
            boxWidth / mask.width,
            boxHeight / mask.height
          );
          visualMaskCanvas.width = Math.max(1, Math.round(mask.width * scale));
          visualMaskCanvas.height = Math.max(
            1,
            Math.round(mask.height * scale)
          );

          const visualCanvasCtx = visualMaskCanvas.getContext("2d");
          visualCanvasCtx.drawImage(
            actualMaskCanvas,
            0,
            0,
            visualMaskCanvas.width,
            visualMaskCanvas.height
          );
        } else if (type === "segment_result") {
          if (data === "start") {
            statusLabel.textContent = "Extracting image embedding...";
          } else {
            statusLabel.textContent = "Embedding extracted!";
            isEncoded = true;
          }
        }
      });

      /**
       * Ask the worker for a mask for the current prompt points (`lastPoints`)
       * and record that a decode request is now in flight.
       */
      function decode() {
        const request = { type: "decode", data: lastPoints };
        isDecoding = true;
        worker.postMessage(request);
      }

      /**
       * Return to the "no prompts" state: leave multi-mask mode, forget the
       * pending points, remove every point marker, disable the cut button and
       * wipe both mask canvases.
       */
      function clearPointsAndMask() {
        // Prompt state
        isMultiMaskMode = false;
        lastPoints = null;

        // Point markers left over from the previous mask (if any)
        for (const marker of document.querySelectorAll(".icon")) {
          marker.remove();
        }

        // Nothing to cut without a mask
        cutButton.disabled = true;

        // Erase the data canvas and the on-screen canvas alike
        for (const canvas of [actualMaskCanvas, visualMaskCanvas]) {
          const ctx = canvas.getContext("2d");
          ctx.clearRect(0, 0, canvas.width, canvas.height);
        }
      }
      clearButton.addEventListener("click", clearPointsAndMask);

      // "Reset image": drop the current image on both sides (UI and worker)
      // and show the upload prompt again.
      resetButton.addEventListener("click", () => {
        // Update state
        isEncoded = false;
        imageDataURI = null;

        // Indicate to worker that we have reset the state
        worker.postMessage({ type: "reset" });

        // Clear points and mask (if present)
        clearPointsAndMask();

        // Forget the previously selected file. Without this, picking the very
        // same file again does not change the input's value, so no "change"
        // event fires and the image could not be re-uploaded after a reset.
        fileUpload.value = "";

        // Update UI
        cutButton.disabled = true;
        imageContainer.style.backgroundImage = "none";
        uploadButton.style.display = "flex";
        statusLabel.textContent = "Ready";
      });

      /**
       * Show a new image in the container and ask the worker to encode it.
       *
       * @param {string} data Image source (a data URI or a URL); used both as
       *   the container's CSS background and as the worker's input.
       */
      function segment(data) {
        // The previous embedding (if any) no longer matches what is displayed
        isEncoded = false;
        imageDataURI = data;

        // Show the image in place of the upload prompt
        uploadButton.style.display = "none";
        imageContainer.style.backgroundImage = `url(${data})`;
        cutButton.disabled = true;
        if (!modelReady) {
          statusLabel.textContent = "Loading model...";
        }

        // Hand the image over to the worker for encoding
        const request = { type: "segment", data };
        worker.postMessage(request);
      }

      // When a file is chosen, read it as a data URI and start segmenting it
      fileUpload.addEventListener("change", (event) => {
        const chosen = event.target.files[0];
        if (chosen) {
          const reader = new FileReader();

          // Once the file has been read, pass the resulting data URI on
          reader.onload = (loaded) => {
            segment(loaded.target.result);
          };

          reader.readAsDataURL(chosen);
        }
      });

      // "(or try example)": segment the bundled example image. The default
      // action is cancelled so that the click does nothing else.
      example.addEventListener("click", (event) => {
        event.preventDefault();

        const exampleImage = EXAMPLE_URL;
        segment(exampleImage);
      });

      /**
       * Drop a marker for a prompt point into the image container.
       *
       * @param {{point: number[], label: number}} prompt `point` is [x, y] as
       *   fractions of the container size; label 1 gets the star icon, any
       *   other label gets the cross icon.
       */
      function addIcon({ point, label }) {
        const [x, y] = point;
        const template = label === 1 ? star : cross;

        const marker = template.cloneNode();
        marker.style.left = `${x * 100}%`;
        marker.style.top = `${y * 100}%`;

        imageContainer.appendChild(marker);
      }

      // Attach click handling to the image container: every left/right press
      // adds a prompt point, marks it with an icon and requests a new mask.
      imageContainer.addEventListener("mousedown", (event) => {
        const isPromptButton = event.button === 0 || event.button === 2;
        if (!isPromptButton) {
          return; // Neither left nor right button
        }
        if (!isEncoded) {
          return; // No encoded image to prompt against yet
        }

        // The first click switches from hover preview to multi-point mode
        if (!isMultiMaskMode) {
          isMultiMaskMode = true;
          lastPoints = [];
          cutButton.disabled = false;
        }

        const prompt = getPoint(event);
        lastPoints.push(prompt);
        addIcon(prompt);

        decode();
      });

      /**
       * Restrict a value to the closed range [min, max].
       *
       * @param {number} x Value to restrict.
       * @param {number} [min=0] Lower bound.
       * @param {number} [max=1] Upper bound.
       * @returns {number} `x` capped at `max` first, then raised to `min`.
       */
      function clamp(x, min = 0, max = 1) {
        const capped = Math.min(x, max);
        return Math.max(capped, min);
      }

      /**
       * Convert a mouse event into a prompt for the worker.
       *
       * @param {MouseEvent} e Mouse event fired on the image container.
       * @returns {{point: number[], label: number}} `point` is [x, y] as
       *   fractions (0..1) of the container's bounding box; `label` is 0 for a
       *   right click (negative prompt) and 1 otherwise (positive prompt).
       */
      function getPoint(e) {
        // NOTE(review): the position is normalised against the container box,
        // not against the area the displayed image actually covers - confirm
        // this is intended for images that do not fill the container.
        const { left, top, width, height } =
          imageContainer.getBoundingClientRect();

        const x = clamp((e.clientX - left) / width);
        const y = clamp((e.clientY - top) / height);

        const isRightClick = e.button === 2;
        const label = isRightClick ? 0 : 1;

        return { point: [x, y], label };
      }

      // Right click places a negative point, so suppress the browser's context menu
      const suppressContextMenu = (event) => {
        event.preventDefault();
      };
      imageContainer.addEventListener("contextmenu", suppressContextMenu);

      // Hover preview: while no point has been clicked yet, keep requesting a
      // mask for whatever is under the cursor.
      imageContainer.addEventListener("mousemove", (event) => {
        const hoverPreviewActive = isEncoded && !isMultiMaskMode;
        if (!hoverPreviewActive) {
          // Image not encoded yet, or points have already been placed
          return;
        }

        // Always remember the newest position ...
        lastPoints = [getPoint(event)];

        // ... but only send it when no other decode request is in flight
        if (isDecoding) {
          return;
        }
        decode();
      });

      // "Cut mask": copy the pixels covered by the current mask out of the
      // image, encode them as PNG and post the bytes to the parent window.
      cutButton.addEventListener("click", () => {
        // Decode a base64 string into its raw bytes
        const base64ToUint8Array = (base64) =>
          Uint8Array.from(window.atob(base64), (ch) => ch.charCodeAt(0));

        const w = actualMaskCanvas.width;
        const h = actualMaskCanvas.height;

        // Snapshot the mask pixels at the time of the click
        const maskPixelData = actualMaskCanvas
          .getContext("2d")
          .getImageData(0, 0, w, h);

        // Build an off-screen canvas of the mask's size
        const createCanvas = () => {
          const canvas = document.createElement("canvas");
          const ctx = canvas.getContext("2d");
          canvas.width = w;
          canvas.height = h;
          return { canvas, ctx };
        };

        // Runs once the source image has loaded
        const buildCutOut = async () => {
          // Render the image at the mask's resolution and read its pixels
          const source = createCanvas();
          source.ctx.drawImage(image, 0, 0, w, h);
          const imagePixelData = source.ctx.getImageData(0, 0, w, h);

          // The cut-out starts as the blank canvas' own pixel data
          const cut = createCanvas();
          const cutPixelData = cut.ctx.getImageData(0, 0, w, h);

          // Copy all four channels of every pixel whose mask alpha is non-zero
          const total = maskPixelData.data.length;
          for (let px = 0; px + 3 < total; px += 4) {
            if (maskPixelData.data[px + 3] > 0) {
              cutPixelData.data[px + 3] = imagePixelData.data[px + 3];
              cutPixelData.data[px + 2] = imagePixelData.data[px + 2];
              cutPixelData.data[px + 1] = imagePixelData.data[px + 1];
              cutPixelData.data[px] = imagePixelData.data[px];
            }
          }
          cut.ctx.putImageData(cutPixelData, 0, 0);

          // Encode as PNG and keep only the base64 payload of the data URL
          const dataURL = cut.canvas.toDataURL("image/png", 1);
          const payload = dataURL.split(";base64,")[1];
          const bytes = base64ToUint8Array(payload);

          const pluginMessage = {
            type: "paste-to-canvas",
            data: { w, h, bytes },
          };
          parent.postMessage({ pluginMessage }, "*");
        };

        // Load the image; anonymous CORS mode is requested before setting the source
        const image = new Image();
        image.crossOrigin = "anonymous";
        image.onload = buildCutOut;
        image.src = imageDataURI;
      });
    </script>
  </body>
</html>