اجرای Deepseek Janus-Pro-1B در مرورگر: یک راهنمای جامع

ek3nk4r 2025-01-28

0 23 خواندن این مطلب 4 دقیقه زمان میبرد

اجرای Deepseek Janus-Pro-1B در مرورگر: یک راهنمای جامع

پیشنهاد ویژه

خرید فالوور واقعی خرید لایک اینستاگرام خرید ویو اینستاگرام خرید فالوور اینستاگرام

اجرای مستقیم مدل‌های زبانی بزرگ (LLMs) در مرورگر، امکانات تازه‌ای را برای برنامه‌های هوش مصنوعیِ سمت کاربر (client-side) با حفظ حریم خصوصی فراهم کرده است. در این پست وبلاگ بررسی می‌کنیم که چگونه می‌توان Deepseek Janus-Pro-1B، یک مدل قدرتمند تولید تصویر از متن، را به‌طور کامل در مرورگر و با استفاده از WebGPU و کتابخانهٔ Transformers.js از Hugging Face اجرا کرد.

فهرست مطالب

چرا استنباط مبتنی بر مرورگر؟

حریم خصوصی: داده‌ها هرگز دستگاه کاربر را ترک نمی‌کنند.
راندمان هزینه: هیچ زیرساخت سرور لازم نیست.
قابلیت دسترسی: در هر دستگاهی با مرورگر مدرن و پشتیبانی WebGPU اجرا می شود.

Deepseek Janus-Pro-1B که برای کارهای چندوجهی (multimodal) مانند تولید تصویر از متن طراحی شده است، اکنون به لطف بهینه‌سازی‌های Transformers.js و شتاب‌دهی WebGPU برای استنتاج در مرورگر در دسترس است.

ابزار و کتابخانه های کلیدی

Transformers.js: نسخهٔ JavaScript (پورت) کتابخانهٔ Transformers از Hugging Face که برای اجرا در مرورگر بهینه شده است.
WebGPU: یک API مدرن برای شتاب‌دهی GPU در مرورگرها که با عملکرد بهتر برای بارهای کاری یادگیری ماشین (ML)، جایگزین WebGL می‌شود.
ONNX Runtime: اجرای مدل را از طریق گراف‌های محاسباتی بهینه‌شده امکان‌پذیر می‌کند.

کد نسخه ی نمایشی

مثال زیر نحوهٔ بارگذاری و اجرای Deepseek Janus-Pro-1B را در یک Web Worker نشان می‌دهد تا استنتاج، رابط کاربری را مسدود نکند. کد کامل در مخزن GitHub موجود است.

import {
  AutoProcessor,
  MultiModalityCausalLM,
  BaseStreamer,
  TextStreamer,
  InterruptableStoppingCriteria,
} from "@huggingface/transformers";

// Chat messages whose content starts with this prefix are handled as
// image-generation requests in generate(); any other message gets a text reply.
const IMAGE_GENERATION_COMMAND_PREFIX = "/imagine ";
// Passed as `max_new_tokens` to model.generate() when producing a text reply.
const MAX_NEW_TEXT_TOKENS = 1024;

/**
 * Whether the GPU adapter reports the "shader-f16" feature. Written by check()
 * and read when the model's dtypes are chosen, so check() should run before
 * the model is loaded.
 */
let fp16_supported = false;

/**
 * Performs WebGPU feature detection and reports the outcome to the main thread.
 *
 * Posts `{ status: "success", data: <boolean> }` (the boolean is the
 * "shader-f16" support flag) when an adapter is found, or
 * `{ status: "error", data: <error text> }` when WebGPU is unavailable or the
 * adapter request fails.
 *
 * @returns {Promise<void>}
 */
async function check() {
  try {
    // `navigator.gpu` is undefined in browsers without WebGPU. Fail with a
    // readable message instead of a TypeError about `requestAdapter`.
    if (!navigator.gpu) {
      throw new Error("WebGPU is not supported (navigator.gpu is unavailable)");
    }

    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) {
      throw new Error("WebGPU is not supported (no adapter found)");
    }

    fp16_supported = adapter.features.has("shader-f16");
    self.postMessage({
      status: "success",
      data: fp16_supported,
    });
  } catch (e) {
    // Detection failures are reported to the main thread rather than thrown.
    self.postMessage({
      status: "error",
      data: e.toString(),
    });
  }
}

/**
 * Lazily loads the processor and model for `model_id` and shares them between
 * callers.
 *
 * The load promises are cached on the class, so repeated calls reuse the same
 * in-flight or finished load. A failed load is not kept in the cache: the next
 * call starts over, which lets the user retry after e.g. a network error.
 */
class ImageGenerationPipeline {
  static model_id = "onnx-community/Janus-Pro-1B-ONNX";

  /**
   * Returns the shared processor and model, starting the load on first use.
   *
   * @param {Function|null} [progress_callback=null] Forwarded to both
   *   `from_pretrained` calls. It is only used by a call that actually starts
   *   a load; calls that hit the cache ignore it.
   * @returns {Promise<Array>} Resolves to `[processor, model]`.
   * @throws Rethrows the load error after clearing the cached promises.
   */
  static async getInstance(progress_callback = null) {
    const processor = (this.processor ??= AutoProcessor.from_pretrained(
      this.model_id,
      { progress_callback },
    ));

    const model = (this.model ??= MultiModalityCausalLM.from_pretrained(
      this.model_id,
      {
        // Per-module dtypes depend on the fp16 flag as it is when the load starts.
        dtype: fp16_supported
          ? {
              prepare_inputs_embeds: "q4",
              language_model: "q4f16",
              lm_head: "fp16",
              gen_head: "fp16",
              gen_img_embeds: "fp16",
              image_decode: "fp32",
            }
          : {
              prepare_inputs_embeds: "fp32",
              language_model: "q4",
              lm_head: "fp32",
              gen_head: "fp32",
              gen_img_embeds: "fp32",
              image_decode: "fp32",
            },
        device: {
          prepare_inputs_embeds: "wasm", // TODO use "webgpu" when bug is fixed
          language_model: "webgpu",
          lm_head: "webgpu",
          gen_head: "webgpu",
          gen_img_embeds: "webgpu",
          image_decode: "webgpu",
        },
        progress_callback,
      },
    ));

    try {
      return await Promise.all([processor, model]);
    } catch (error) {
      // Never keep a rejected promise cached, otherwise every later call would
      // fail immediately. Only clear the entries this call was waiting on, so
      // a retry started in the meantime by another caller is left alone.
      if (this.processor === processor) {
        this.processor = null;
      }
      if (this.model === model) {
        this.model = null;
      }
      throw error;
    }
  }
}

/**
 * Streamer that counts generation steps and reports progress through a callback.
 *
 * The first `put()` call is treated as the prompt batch: it only starts the
 * timer. Every later call increments the counter and invokes `on_progress`
 * with `{ count, total, progress, time }`, where `time` is the number of
 * milliseconds elapsed since that first call.
 */
class ProgressStreamer extends BaseStreamer {
  /**
   * @param {number} total Number of steps expected after the prompt batch.
   * @param {Function} on_progress Called once per counted step.
   */
  constructor(total, on_progress) {
    super();
    Object.assign(this, {
      total,
      on_progress,
      count: null, // null until the prompt batch has been seen
      start_time: null,
    });
  }

  /**
   * Registers one batch of generated tokens.
   *
   * @param {*} value The streamed tokens; only the call itself is counted.
   */
  put(value) {
    const is_prompt_batch = this.count === null;
    if (is_prompt_batch) {
      // The prompt is not counted as progress; it just starts the clock.
      this.start_time = performance.now();
      this.count = 0;
      return;
    }

    this.count += 1;
    const { count, total } = this;

    this.on_progress({
      count,
      total,
      progress: count / total,
      time: performance.now() - this.start_time,
    });
  }

  end() {
    /* do nothing */
  }
}

// Shared stopping criteria for text generation: passed to model.generate() and
// reset/interrupted by the worker's message handler ("generate", "interrupt",
// "reset" messages).
const stopping_criteria = new InterruptableStoppingCriteria();

/**
 * Answers the most recent chat message and streams the result to the main thread.
 *
 * Posts `{ status: "start" }`, then either "image-update" messages (when the
 * message starts with IMAGE_GENERATION_COMMAND_PREFIX) or "text-update"
 * messages, and finally `{ status: "complete" }`.
 *
 * @param {Array<{content: string, image?: *}>} messages Conversation so far;
 *   only the last entry is used in this demo.
 * @returns {Promise<void>}
 */
async function generate(messages) {
  const message = messages.at(-1);

  // Let the main thread know that work has begun.
  self.postMessage({ status: "start" });

  const [processor, model] = await ImageGenerationPipeline.getInstance();

  if (message.content.startsWith(IMAGE_GENERATION_COMMAND_PREFIX)) {
    const prompt = message.content.slice(IMAGE_GENERATION_COMMAND_PREFIX.length);
    await generateImage(processor, model, prompt);
  } else {
    await generateText(processor, model, message);
  }

  // Let the main thread know that we are finished.
  self.postMessage({ status: "complete" });
}

/**
 * Generates an image for `prompt`, posting one "image-update" message per
 * progress event and a last one carrying the resulting blob.
 */
async function generateImage(processor, model, prompt) {
  const inputs = await processor(
    [
      {
        role: "<|User|>", // uses title case
        content: prompt,
      },
    ],
    { chat_template: "text_to_image" },
  );

  const image_token_count = processor.num_image_tokens;
  const streamer = new ProgressStreamer(image_token_count, (progress_info) => {
    self.postMessage({
      status: "image-update",
      ...progress_info,
    });
  });

  // min and max are equal, so the model is asked for exactly this many tokens.
  const images = await model.generate_images({
    ...inputs,
    min_new_tokens: image_token_count,
    max_new_tokens: image_token_count,
    do_sample: true,
    streamer,
  });

  const blob = await images[0].toBlob();
  self.postMessage({
    status: "image-update",
    blob,
  });
}

/**
 * Generates a text reply to `message`, posting a "text-update" message with
 * the streamed text, the token count and the tokens-per-second estimate.
 */
async function generateText(processor, model, message) {
  const { content, image } = message;

  // With an attached image the question is sent alone, prefixed by a newline;
  // otherwise a system prompt precedes the user's message.
  const conversation = image
    ? [
        {
          role: "<|User|>",
          content: `\n${content}`,
          images: [image],
        },
      ]
    : [
        {
          role: "<|System|>",
          content:
            "You are a helpful assistant. Answer the user's questions in a concise manner.",
        },
        {
          role: "<|User|>",
          content,
        },
      ];
  const inputs = await processor(conversation);

  let first_token_at;
  let token_count = 0;
  let tokens_per_second;

  const on_token = () => {
    first_token_at ??= performance.now();
    token_count += 1;

    // The rate stays undefined until a second token has arrived.
    if (token_count > 1) {
      tokens_per_second =
        (token_count / (performance.now() - first_token_at)) * 1000;
    }
  };
  const on_text = (output) => {
    self.postMessage({
      status: "text-update",
      output,
      tps: tokens_per_second,
      numTokens: token_count,
    });
  };

  const streamer = new TextStreamer(processor.tokenizer, {
    skip_prompt: true,
    skip_special_tokens: true,
    callback_function: on_text,
    token_callback_function: on_token,
  });

  await model.generate({
    ...inputs,
    max_new_tokens: MAX_NEW_TEXT_TOKENS,
    do_sample: false,
    streamer,
    stopping_criteria,
  });
}

/**
 * Loads the pipeline (or reuses the cached one) and keeps the main thread
 * informed: posts a "loading" message, forwards every progress event emitted
 * while loading, then posts "ready".
 *
 * @returns {Promise<void>}
 */
async function load() {
  const forward_to_main_thread = (payload) => {
    self.postMessage(payload);
  };

  forward_to_main_thread({
    status: "loading",
    data: "Loading model...",
  });

  // Progress events are forwarded as-is. The loaded instances stay cached in
  // the pipeline class, so the result is not needed here.
  await ImageGenerationPipeline.getInstance(forward_to_main_thread);

  forward_to_main_thread({ status: "ready" });
}

// Handlers for the message types the main thread can send, keyed by `type`.
// Each handler receives the message's `data` payload.
const message_handlers = new Map([
  ["check", () => check()],
  ["load", () => load()],
  [
    "generate",
    (data) => {
      // Clear any earlier interrupt before starting a new generation.
      stopping_criteria.reset();
      generate(data);
    },
  ],
  ["interrupt", () => stopping_criteria.interrupt()],
  ["reset", () => stopping_criteria.reset()],
]);

// Listen for messages from the main thread; unknown types are ignored.
self.addEventListener("message", async (e) => {
  const { type, data } = e.data;
  message_handlers.get(type)?.(data);
});

اجرای نسخه ی نمایشی

نسخهٔ نمایشی زنده را در اینجا ببینید: نسخهٔ نمایشی مرورگر Deepseek Janus-Pro-1B.

ویژگی های کلیدی نسخه ی نمایشی:

به روزرسانی های پیشرفت در زمان واقعی در هنگام بارگیری و استنباط مدل.
تولید با شتاب‌دهی WebGPU (به Chrome 113+ یا Edge 113+ نیاز دارد).
اجرای کاملاً سمت کاربر؛ هیچ داده‌ای به سرورهای خارجی ارسال نمی‌شود.

چالش ها و بهینه سازی ها

کوانتیزه‌سازی مدل: این مدل برای کاهش حجم و بهبود سرعت بارگذاری به 8 بیت کوانتیزه می‌شود (توجه: در کد نمونهٔ بالا، مدل زبانی با dtypeهای q4 یا q4f16 بارگذاری می‌شود).
مدیریت حافظه: Web Workerها از فریز شدن رابط کاربری (UI) هنگام استنتاج جلوگیری می‌کنند.
سازگاری مرورگر: WebGPU هنوز آزمایشی است، اما برای عملکرد بسیار مهم است.

پایان

اجرای Deepseek Janus-Pro-1B در مرورگر، توانمندی هوش مصنوعیِ سمت کاربر را به نمایش می‌گذارد. با ابزارهایی مانند Transformers.js و WebGPU، مدل‌های پیچیده اکنون می‌توانند ضمن حفظ حریم خصوصی کاربر، در محیط‌های با منابع محدود نیز اجرا شوند.

مراحل بعدی: