Commit
·
361ee5e
0
Parent(s):
Initial model release
Browse files- README.md +53 -0
- clapProcessor.js +158 -0
- example-usage.html +30 -0
- localClassifier.js +205 -0
- model-config.json +67 -0
- userFeedbackStore.js +213 -0
README.md
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
base_model: Xenova/clap-htsat-unfused
|
4 |
+
tags:
|
5 |
+
- audio-classification
|
6 |
+
- transformers.js
|
7 |
+
- clap
|
8 |
+
- audio-tagging
|
9 |
+
library_name: transformers.js
|
10 |
+
---
|
11 |
+
|
12 |
+
# clip-tagger Model
|
13 |
+
|
14 |
+
This is a personalized audio tagging model based on CLAP (Contrastive Language-Audio Pre-training). It extends the base Xenova/clap-htsat-unfused model with user feedback and custom tags.
|
15 |
+
|
16 |
+
## Model Description
|
17 |
+
|
18 |
+
- **Base Model**: [Xenova/clap-htsat-unfused](https://huggingface.co/Xenova/clap-htsat-unfused)
|
19 |
+
- **Framework**: Transformers.js compatible
|
20 |
+
- **Training**: User feedback and custom tag integration
|
21 |
+
- **Use Case**: Personalized audio content tagging
|
22 |
+
|
23 |
+
## Usage
|
24 |
+
|
25 |
+
```javascript
|
26 |
+
import CLAPProcessor from './clapProcessor.js';
|
27 |
+
import LocalClassifier from './localClassifier.js';
|
28 |
+
|
29 |
+
// Load the model
|
30 |
+
const processor = new CLAPProcessor();
|
31 |
+
const classifier = new LocalClassifier();
|
32 |
+
classifier.loadModel(); // Loads from localStorage or model files
|
33 |
+
|
34 |
+
// Process audio
|
35 |
+
const tags = await processor.processAudio(audioBuffer);
|
36 |
+
const personalizedTags = classifier.predictAll(features, candidateTags);
|
37 |
+
```
|
38 |
+
|
39 |
+
## Files
|
40 |
+
|
41 |
+
- `localClassifier.js` - Local classifier implementation
|
42 |
+
- `clapProcessor.js` - CLAP model wrapper
|
43 |
+
- `userFeedbackStore.js` - User feedback storage system
|
44 |
+
- `model-config.json` - Model configuration
|
45 |
+
- `example-usage.html` - Usage example
|
46 |
+
|
47 |
+
## Demo
|
48 |
+
|
49 |
+
Try the live demo: [clip-tagger Space](https://huggingface.co/spaces/sohei1l/clip-tagger)
|
50 |
+
|
51 |
+
## Training Data
|
52 |
+
|
53 |
+
This model learns from user corrections and custom tags. The base CLAP model provides initial audio understanding, while the local classifier adapts to user preferences.
|
clapProcessor.js
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { AutoProcessor, ClapAudioModelWithProjection } from '@xenova/transformers';

/**
 * Wrapper around the Xenova/clap-htsat-unfused CLAP model.
 *
 * Tags audio by embedding it with the CLAP audio tower and ranking a fixed
 * list of text labels by cosine similarity against the audio embedding.
 */
class CLAPProcessor {
  constructor() {
    this.model = null;
    this.processor = null;
    // Candidate labels scored against every audio clip.
    this.defaultLabels = [
      'speech', 'music', 'singing', 'guitar', 'piano', 'drums', 'violin',
      'trumpet', 'saxophone', 'flute', 'classical music', 'rock music',
      'pop music', 'jazz', 'electronic music', 'ambient', 'nature sounds',
      'rain', 'wind', 'ocean waves', 'birds chirping', 'dog barking',
      'cat meowing', 'car engine', 'traffic', 'footsteps', 'door closing',
      'applause', 'laughter', 'crying', 'coughing', 'sneezing',
      'telephone ringing', 'alarm clock', 'typing', 'water running',
      'fire crackling', 'thunder', 'helicopter', 'airplane', 'train',
      'motorcycle', 'bell ringing', 'whistle', 'horn', 'siren',
      'explosion', 'gunshot', 'silence', 'noise', 'distortion'
    ];
  }

  /**
   * Lazily load the CLAP processor and model. Idempotent: returns
   * immediately if both are already loaded.
   * @throws re-throws any download/initialization error after logging it.
   */
  async initialize() {
    if (this.model && this.processor) return;

    try {
      // Load the CLAP model and processor
      this.processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused');
      this.model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');

      console.log('CLAP model loaded successfully');
    } catch (error) {
      console.error('Failed to load CLAP model:', error);
      throw error;
    }
  }

  /**
   * Tag an audio clip.
   * @param {AudioBuffer} audioBuffer - decoded Web Audio buffer.
   * @returns {Promise<Array<{label: string, confidence: number}>>} top 5 labels.
   */
  async processAudio(audioBuffer) {
    if (!this.model || !this.processor) {
      await this.initialize();
    }

    try {
      // Convert audio to the mono 48 kHz format expected by CLAP.
      const audio = await this.preprocessAudio(audioBuffer);

      // Process audio through the model
      const audioInputs = await this.processor(audio);
      const audioFeatures = await this.model.get_audio_features(audioInputs);

      // Process text labels.
      // NOTE(review): text features are requested from the audio-projection
      // model via the audio processor's `.text()` helper — confirm against
      // the transformers.js CLAP docs; text embeddings normally come from
      // ClapTextModelWithProjection + a tokenizer.
      const textInputs = await this.processor.text(this.defaultLabels);
      const textFeatures = await this.model.get_text_features(textInputs);

      // Calculate similarities
      const similarities = await this.calculateSimilarities(audioFeatures, textFeatures);

      // Return top tags with confidence scores
      return this.getTopTags(similarities, 5);
    } catch (error) {
      console.error('Error processing audio:', error);
      throw error;
    }
  }

  /**
   * Downmix to mono and resample to 48 kHz (CLAP's expected input rate).
   * @param {AudioBuffer} audioBuffer
   * @returns {Promise<Float32Array>} mono PCM at 48 kHz.
   */
  async preprocessAudio(audioBuffer) {
    // Convert to mono if stereo by averaging all channels per sample.
    let audioData;
    if (audioBuffer.numberOfChannels > 1) {
      audioData = new Float32Array(audioBuffer.length);
      for (let i = 0; i < audioBuffer.length; i++) {
        let sum = 0;
        for (let channel = 0; channel < audioBuffer.numberOfChannels; channel++) {
          sum += audioBuffer.getChannelData(channel)[i];
        }
        audioData[i] = sum / audioBuffer.numberOfChannels;
      }
    } else {
      audioData = audioBuffer.getChannelData(0);
    }

    // Resample to 48kHz if needed (CLAP expects 48kHz)
    const targetSampleRate = 48000;
    if (audioBuffer.sampleRate !== targetSampleRate) {
      audioData = await this.resampleAudio(audioData, audioBuffer.sampleRate, targetSampleRate);
    }

    return audioData;
  }

  /**
   * Resample PCM data with simple linear interpolation.
   * (Not band-limited — adequate for tagging, not for playback quality.)
   * @param {Float32Array} audioData
   * @param {number} originalRate
   * @param {number} targetRate
   * @returns {Promise<Float32Array>}
   */
  async resampleAudio(audioData, originalRate, targetRate) {
    const ratio = originalRate / targetRate;
    const newLength = Math.round(audioData.length / ratio);
    const resampled = new Float32Array(newLength);

    for (let i = 0; i < newLength; i++) {
      const originalIndex = i * ratio;
      const indexFloor = Math.floor(originalIndex);
      const indexCeil = Math.min(indexFloor + 1, audioData.length - 1);
      const fraction = originalIndex - indexFloor;

      resampled[i] = audioData[indexFloor] * (1 - fraction) + audioData[indexCeil] * fraction;
    }

    return resampled;
  }

  /**
   * Cosine similarity between the audio embedding and each label embedding.
   * Assumes textFeatures.data is laid out as [labelCount x embeddingDim]
   * with embeddingDim equal to the audio vector length — TODO confirm.
   * @returns {Promise<number[]>} one similarity per label.
   */
  async calculateSimilarities(audioFeatures, textFeatures) {
    const audioVector = audioFeatures.data;
    const similarities = [];

    for (let i = 0; i < this.defaultLabels.length; i++) {
      const textVector = textFeatures.data.slice(
        i * audioVector.length,
        (i + 1) * audioVector.length
      );

      const similarity = this.cosineSimilarity(audioVector, textVector);
      similarities.push(similarity);
    }

    return similarities;
  }

  /**
   * Cosine similarity of two equal-length vectors.
   * Returns 0 when either vector has zero norm (avoids NaN from 0/0).
   * @param {ArrayLike<number>} vecA
   * @param {ArrayLike<number>} vecB
   * @returns {number} similarity in [-1, 1].
   */
  cosineSimilarity(vecA, vecB) {
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < vecA.length; i++) {
      dotProduct += vecA[i] * vecB[i];
      normA += vecA[i] * vecA[i];
      normB += vecB[i] * vecB[i];
    }

    const denom = Math.sqrt(normA) * Math.sqrt(normB);
    return denom === 0 ? 0 : dotProduct / denom;
  }

  /**
   * Pair each label with its (clamped non-negative) similarity and return
   * the `topK` highest-confidence tags.
   * @param {number[]} similarities - parallel to this.defaultLabels.
   * @param {number} [topK=5]
   * @returns {Array<{label: string, confidence: number}>}
   */
  getTopTags(similarities, topK = 5) {
    const tagged = this.defaultLabels.map((label, index) => ({
      label,
      confidence: Math.max(0, similarities[index]) // Ensure non-negative
    }));

    return tagged
      .sort((a, b) => b.confidence - a.confidence)
      .slice(0, topK);
  }

  /**
   * Decode an uploaded File/Blob into an AudioBuffer.
   * The temporary AudioContext is closed afterwards — contexts hold a
   * limited hardware resource and leaked per call in the original.
   * @param {File|Blob} file
   * @returns {Promise<AudioBuffer>}
   */
  async fileToAudioBuffer(file) {
    const arrayBuffer = await file.arrayBuffer();
    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
    try {
      return await audioContext.decodeAudioData(arrayBuffer);
    } finally {
      await audioContext.close();
    }
  }
}

export default CLAPProcessor;
|
example-usage.html
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
<html>
<head>
  <title>clip-tagger Model Usage Example</title>
  <script type="module">
    // Both modules use `export default`, so default imports are required.
    // (Named imports `{ CLAPProcessor }` would resolve to undefined.)
    import CLAPProcessor from './clapProcessor.js';
    import LocalClassifier from './localClassifier.js';

    async function loadModel() {
      const processor = new CLAPProcessor();
      const classifier = new LocalClassifier();

      // Initialize: download CLAP weights, then restore any locally
      // trained classifier state from localStorage.
      await processor.initialize();
      classifier.loadModel();

      console.log('Model loaded successfully!');
      console.log('Model stats:', classifier.getModelStats());
    }

    // Load when page loads; surface failures instead of leaving a
    // floating rejected promise.
    loadModel().catch((err) => console.error('Model load failed:', err));
  </script>
</head>
<body>
  <h1>clip-tagger Model</h1>
  <p>Check the browser console for model loading status.</p>
  <p>See the full demo at: <a href="https://huggingface.co/spaces/sohei1l/clip-tagger">clip-tagger Space</a></p>
</body>
</html>
|
localClassifier.js
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Per-tag online logistic-regression classifier that personalizes CLAP
 * tag predictions from user feedback. One weight vector + bias per tag,
 * trained with plain SGD; persisted to localStorage.
 */
class LocalClassifier {
  constructor() {
    this.weights = new Map(); // tag -> weight vector
    this.biases = new Map(); // tag -> bias
    this.learningRate = 0.01;
    this.featureDim = 512; // CLAP embedding dimension
    this.isInitialized = false;
  }

  /**
   * Set the feature dimensionality (defaults to the CLAP embedding size).
   * @param {number} [featureDim=512]
   */
  initialize(featureDim = 512) {
    this.featureDim = featureDim;
    this.isInitialized = true;
  }

  /**
   * One SGD step of logistic regression for `tag`.
   * @param {number[]} features - feature vector (up to featureDim entries used).
   * @param {string} tag - tag being trained.
   * @param {'positive'|'negative'|'custom'} feedback - target signal;
   *   unknown values are ignored silently.
   */
  trainOnFeedback(features, tag, feedback) {
    if (!this.isInitialized) {
      this.initialize();
    }

    // Convert feedback to target value
    let target;
    switch (feedback) {
      case 'positive':
        target = 1.0;
        break;
      case 'negative':
        target = 0.0;
        break;
      case 'custom':
        target = 1.0; // a user-added tag counts as a positive example
        break;
      default:
        return; // Skip unknown feedback
    }

    // Initialize weights for a new tag with small random values to break symmetry.
    if (!this.weights.has(tag)) {
      this.weights.set(tag, new Array(this.featureDim).fill(0).map(() =>
        (Math.random() - 0.5) * 0.01
      ));
      this.biases.set(tag, 0);
    }

    const weights = this.weights.get(tag);
    const bias = this.biases.get(tag);

    // Forward pass. Clamp to the shorter of the two lengths so an
    // oversized feature vector cannot read undefined weights and poison
    // the model with NaN (predict() already guards the same way).
    const dim = Math.min(features.length, weights.length);
    let logit = bias;
    for (let i = 0; i < dim; i++) {
      logit += weights[i] * features[i];
    }

    // Sigmoid activation
    const prediction = 1 / (1 + Math.exp(-logit));

    // Gradient of the logistic loss w.r.t. the logit.
    const error = prediction - target;

    // Update weights (in place) and bias.
    for (let i = 0; i < dim; i++) {
      weights[i] -= this.learningRate * error * features[i];
    }
    this.biases.set(tag, bias - this.learningRate * error);
  }

  /**
   * Predict the confidence that `tag` applies to `features`.
   * @returns {number|null} sigmoid confidence in (0, 1), or null if the
   *   tag has never been trained.
   */
  predict(features, tag) {
    if (!this.weights.has(tag)) {
      return null; // No training data for this tag
    }

    const weights = this.weights.get(tag);
    const bias = this.biases.get(tag);

    let logit = bias;
    for (let i = 0; i < Math.min(features.length, weights.length); i++) {
      logit += weights[i] * features[i];
    }

    // Sigmoid activation
    return 1 / (1 + Math.exp(-logit));
  }

  /**
   * Score every trained candidate tag against `features`.
   * @returns {Array<{tag: string, confidence: number}>} sorted by
   *   descending confidence; untrained tags are omitted.
   */
  predictAll(features, candidateTags) {
    const predictions = [];

    for (const tag of candidateTags) {
      const confidence = this.predict(features, tag);
      if (confidence !== null) {
        predictions.push({ tag, confidence });
      }
    }

    return predictions.sort((a, b) => b.confidence - a.confidence);
  }

  /**
   * Retrain on a batch of stored feedback records.
   * Each record needs `audioFeatures` (metadata) and `correctedTags`
   * ([{tag, feedback}]); records missing either are skipped.
   */
  retrainOnBatch(feedbackData) {
    for (const item of feedbackData) {
      if (item.audioFeatures && item.correctedTags) {
        // Create simple features from audio metadata
        const features = this.extractSimpleFeatures(item.audioFeatures);

        // Train on corrected tags
        for (const tagData of item.correctedTags) {
          this.trainOnFeedback(features, tagData.tag, tagData.feedback);
        }
      }
    }
  }

  /**
   * Build a pseudo feature vector from audio metadata (duration, sample
   * rate, channels) padded with deterministic hash-seeded noise.
   * In a real implementation this would use actual CLAP embeddings.
   * @param {{duration: number, sampleRate: number, numberOfChannels: number}|null} audioFeatures
   * @returns {number[]} vector of length featureDim (all zeros if no metadata).
   */
  extractSimpleFeatures(audioFeatures) {
    const features = new Array(this.featureDim).fill(0);

    if (audioFeatures) {
      // Use basic audio properties to create pseudo-features
      features[0] = audioFeatures.duration / 60; // Duration in minutes
      features[1] = audioFeatures.sampleRate / 48000; // Normalized sample rate
      features[2] = audioFeatures.numberOfChannels; // Number of channels

      // Fill remaining with small deterministic values seeded by a hash of
      // the metadata, so the same clip always maps to the same vector.
      const seed = this.simpleHash(JSON.stringify(audioFeatures));
      for (let i = 3; i < this.featureDim; i++) {
        features[i] = this.seededRandom(seed + i) * 0.1;
      }
    }

    return features;
  }

  /**
   * Simple 32-bit string hash (djb2-style) used to seed pseudo-features.
   * @returns {number} non-negative integer.
   */
  simpleHash(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash; // Convert to 32-bit integer
    }
    return Math.abs(hash);
  }

  /**
   * Deterministic pseudo-random value in [0, 1) for a given seed.
   */
  seededRandom(seed) {
    const x = Math.sin(seed) * 10000;
    return x - Math.floor(x);
  }

  /**
   * Persist weights, biases, and hyperparameters to localStorage under
   * the 'clipTaggerModel' key.
   */
  saveModel() {
    const modelData = {
      weights: Object.fromEntries(this.weights),
      biases: Object.fromEntries(this.biases),
      featureDim: this.featureDim,
      learningRate: this.learningRate
    };

    localStorage.setItem('clipTaggerModel', JSON.stringify(modelData));
  }

  /**
   * Restore model state previously written by saveModel().
   * @returns {boolean} true if a saved model was loaded successfully.
   */
  loadModel() {
    const saved = localStorage.getItem('clipTaggerModel');
    if (saved) {
      try {
        const modelData = JSON.parse(saved);
        this.weights = new Map(Object.entries(modelData.weights));
        this.biases = new Map(Object.entries(modelData.biases));
        this.featureDim = modelData.featureDim || 512;
        this.learningRate = modelData.learningRate || 0.01;
        this.isInitialized = true;
        return true;
      } catch (error) {
        console.error('Error loading model:', error);
      }
    }
    return false;
  }

  /**
   * Summarize model state for display/debugging.
   * @returns {{trainedTags: number, featureDim: number, learningRate: number, tags: string[]}}
   */
  getModelStats() {
    return {
      trainedTags: this.weights.size,
      featureDim: this.featureDim,
      learningRate: this.learningRate,
      tags: Array.from(this.weights.keys())
    };
  }

  /**
   * Drop all learned state, both in memory and in localStorage.
   */
  clearModel() {
    this.weights.clear();
    this.biases.clear();
    localStorage.removeItem('clipTaggerModel');
  }
}

export default LocalClassifier;
|
model-config.json
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_type": "clip-tagger",
|
3 |
+
"base_model": "Xenova/clap-htsat-unfused",
|
4 |
+
"version": "1.0.0",
|
5 |
+
"framework": "transformers.js",
|
6 |
+
"feature_dim": 512,
|
7 |
+
"learning_rate": 0.01,
|
8 |
+
"supported_formats": [
|
9 |
+
"wav",
|
10 |
+
"mp3",
|
11 |
+
"m4a",
|
12 |
+
"ogg"
|
13 |
+
],
|
14 |
+
"default_labels": [
|
15 |
+
"speech",
|
16 |
+
"music",
|
17 |
+
"singing",
|
18 |
+
"guitar",
|
19 |
+
"piano",
|
20 |
+
"drums",
|
21 |
+
"violin",
|
22 |
+
"trumpet",
|
23 |
+
"saxophone",
|
24 |
+
"flute",
|
25 |
+
"classical music",
|
26 |
+
"rock music",
|
27 |
+
"pop music",
|
28 |
+
"jazz",
|
29 |
+
"electronic music",
|
30 |
+
"ambient",
|
31 |
+
"nature sounds",
|
32 |
+
"rain",
|
33 |
+
"wind",
|
34 |
+
"ocean waves",
|
35 |
+
"birds chirping",
|
36 |
+
"dog barking",
|
37 |
+
"cat meowing",
|
38 |
+
"car engine",
|
39 |
+
"traffic",
|
40 |
+
"footsteps",
|
41 |
+
"door closing",
|
42 |
+
"applause",
|
43 |
+
"laughter",
|
44 |
+
"crying",
|
45 |
+
"coughing",
|
46 |
+
"sneezing",
|
47 |
+
"telephone ringing",
|
48 |
+
"alarm clock",
|
49 |
+
"typing",
|
50 |
+
"water running",
|
51 |
+
"fire crackling",
|
52 |
+
"thunder",
|
53 |
+
"helicopter",
|
54 |
+
"airplane",
|
55 |
+
"train",
|
56 |
+
"motorcycle",
|
57 |
+
"bell ringing",
|
58 |
+
"whistle",
|
59 |
+
"horn",
|
60 |
+
"siren",
|
61 |
+
"explosion",
|
62 |
+
"gunshot",
|
63 |
+
"silence",
|
64 |
+
"noise",
|
65 |
+
"distortion"
|
66 |
+
]
|
67 |
+
}
|
userFeedbackStore.js
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * IndexedDB-backed store for user tagging feedback.
 *
 * Three object stores in the 'ClipTaggerDB' database:
 *  - audioFeedback: per-clip correction records (auto-increment id, timestamp index)
 *  - tagFeedback:   per-tag feedback events (auto-increment id, tag + timestamp indexes)
 *  - customTags:    user-created tags keyed by tag name, with a usage counter
 */
class UserFeedbackStore {
  constructor() {
    this.dbName = 'ClipTaggerDB';
    this.version = 1;
    this.db = null; // IDBDatabase handle; set by initialize()
  }

  /**
   * Open (and, on first run or version bump, create) the database.
   * Resolves once the connection is ready; rejects on open failure.
   */
  async initialize() {
    return new Promise((resolve, reject) => {
      const request = indexedDB.open(this.dbName, this.version);

      request.onerror = () => reject(request.error);
      request.onsuccess = () => {
        this.db = request.result;
        resolve();
      };

      // Runs only when the DB is created or upgraded, before onsuccess.
      request.onupgradeneeded = (event) => {
        const db = event.target.result;

        // Create object stores
        if (!db.objectStoreNames.contains('audioFeedback')) {
          const audioStore = db.createObjectStore('audioFeedback', {
            keyPath: 'id',
            autoIncrement: true
          });
          audioStore.createIndex('timestamp', 'timestamp', { unique: false });
        }

        if (!db.objectStoreNames.contains('tagFeedback')) {
          const tagStore = db.createObjectStore('tagFeedback', {
            keyPath: 'id',
            autoIncrement: true
          });
          tagStore.createIndex('tag', 'tag', { unique: false });
          tagStore.createIndex('timestamp', 'timestamp', { unique: false });
        }

        if (!db.objectStoreNames.contains('customTags')) {
          const customTagStore = db.createObjectStore('customTags', {
            keyPath: 'tag'
          });
          customTagStore.createIndex('usage', 'usage', { unique: false });
        }
      };
    });
  }

  /**
   * Persist a full correction record for one audio clip.
   * @param {string} audioHash - content hash from hashAudioFile().
   * @param {Array} originalTags - tags the model originally proposed.
   * @param {Array} correctedTags - tags after user correction.
   * @param {Object} audioFeatures - audio metadata captured with the clip.
   * @returns {Promise<IDBValidKey>} the auto-generated record id.
   */
  async saveAudioFeedback(audioHash, originalTags, correctedTags, audioFeatures) {
    if (!this.db) await this.initialize();

    const transaction = this.db.transaction(['audioFeedback'], 'readwrite');
    const store = transaction.objectStore('audioFeedback');

    const feedback = {
      audioHash,
      originalTags,
      correctedTags,
      audioFeatures,
      timestamp: Date.now()
    };

    return new Promise((resolve, reject) => {
      const request = store.add(feedback);
      request.onsuccess = () => resolve(request.result);
      request.onerror = () => reject(request.error);
    });
  }

  /**
   * Persist a single feedback event for one tag on one clip.
   * @param {string} tag
   * @param {'positive'|'negative'|'custom'} feedback
   * @param {string} audioHash
   * @returns {Promise<IDBValidKey>} the auto-generated record id.
   */
  async saveTagFeedback(tag, feedback, audioHash) {
    if (!this.db) await this.initialize();

    const transaction = this.db.transaction(['tagFeedback'], 'readwrite');
    const store = transaction.objectStore('tagFeedback');

    const tagFeedback = {
      tag,
      feedback, // 'positive', 'negative', or 'custom'
      audioHash,
      timestamp: Date.now()
    };

    return new Promise((resolve, reject) => {
      const request = store.add(tagFeedback);
      request.onsuccess = () => resolve(request.result);
      request.onerror = () => reject(request.error);
    });
  }

  /**
   * Record usage of a user-created tag: inserts it with usage=1 on first
   * use, otherwise increments the existing usage counter (read-then-put
   * inside one readwrite transaction).
   * @param {string} tag
   */
  async saveCustomTag(tag) {
    if (!this.db) await this.initialize();

    const transaction = this.db.transaction(['customTags'], 'readwrite');
    const store = transaction.objectStore('customTags');

    return new Promise((resolve, reject) => {
      const getRequest = store.get(tag);
      getRequest.onsuccess = () => {
        const existing = getRequest.result;
        const tagData = existing ?
          { ...existing, usage: existing.usage + 1 } :
          { tag, usage: 1, timestamp: Date.now() };

        const putRequest = store.put(tagData);
        putRequest.onsuccess = () => resolve(putRequest.result);
        putRequest.onerror = () => reject(putRequest.error);
      };
      getRequest.onerror = () => reject(getRequest.error);
    });
  }

  /**
   * Fetch custom tags ordered by usage count, most used first.
   * @param {number} [limit=20] - maximum number of tags returned.
   * @returns {Promise<Array<{tag: string, usage: number, timestamp: number}>>}
   */
  async getCustomTags(limit = 20) {
    if (!this.db) await this.initialize();

    const transaction = this.db.transaction(['customTags'], 'readonly');
    const store = transaction.objectStore('customTags');
    const index = store.index('usage');

    return new Promise((resolve, reject) => {
      const request = index.openCursor(null, 'prev'); // Descending order
      const results = [];

      request.onsuccess = (event) => {
        const cursor = event.target.result;
        if (cursor && results.length < limit) {
          results.push(cursor.value);
          cursor.continue();
        } else {
          resolve(results);
        }
      };
      request.onerror = () => reject(request.error);
    });
  }

  /**
   * Fetch tag feedback events; all of them, or only those for one tag.
   * @param {string|null} [tag=null] - filter by tag, or null for all.
   * @returns {Promise<Array>} matching feedback records.
   */
  async getTagFeedback(tag = null) {
    if (!this.db) await this.initialize();

    const transaction = this.db.transaction(['tagFeedback'], 'readonly');
    const store = transaction.objectStore('tagFeedback');

    return new Promise((resolve, reject) => {
      let request;
      if (tag) {
        const index = store.index('tag');
        request = index.getAll(tag);
      } else {
        request = store.getAll();
      }

      request.onsuccess = () => resolve(request.result);
      request.onerror = () => reject(request.error);
    });
  }

  /**
   * Fetch audio correction records, newest first.
   * @param {number} [limit=100] - maximum number of records returned.
   * @returns {Promise<Array>}
   */
  async getAudioFeedback(limit = 100) {
    if (!this.db) await this.initialize();

    const transaction = this.db.transaction(['audioFeedback'], 'readonly');
    const store = transaction.objectStore('audioFeedback');
    const index = store.index('timestamp');

    return new Promise((resolve, reject) => {
      const request = index.openCursor(null, 'prev'); // Most recent first
      const results = [];

      request.onsuccess = (event) => {
        const cursor = event.target.result;
        if (cursor && results.length < limit) {
          results.push(cursor.value);
          cursor.continue();
        } else {
          resolve(results);
        }
      };
      request.onerror = () => reject(request.error);
    });
  }

  // Generate a simple hash for audio content
  /**
   * Content-hash a file: SHA-256 of the raw bytes, hex-encoded,
   * truncated to the first 16 hex chars (64 bits) as a compact clip id.
   * @param {File|Blob} file
   * @returns {Promise<string>} 16-character hex string.
   */
  async hashAudioFile(file) {
    const arrayBuffer = await file.arrayBuffer();
    const hashBuffer = await crypto.subtle.digest('SHA-256', arrayBuffer);
    const hashArray = Array.from(new Uint8Array(hashBuffer));
    return hashArray.map(b => b.toString(16).padStart(2, '0')).join('').substring(0, 16);
  }

  /**
   * Wipe every record from all three object stores in one transaction.
   */
  async clearAllData() {
    if (!this.db) await this.initialize();

    const transaction = this.db.transaction(['audioFeedback', 'tagFeedback', 'customTags'], 'readwrite');

    await Promise.all([
      new Promise((resolve, reject) => {
        const request = transaction.objectStore('audioFeedback').clear();
        request.onsuccess = () => resolve();
        request.onerror = () => reject(request.error);
      }),
      new Promise((resolve, reject) => {
        const request = transaction.objectStore('tagFeedback').clear();
        request.onsuccess = () => resolve();
        request.onerror = () => reject(request.error);
      }),
      new Promise((resolve, reject) => {
        const request = transaction.objectStore('customTags').clear();
        request.onsuccess = () => resolve();
        request.onerror = () => reject(request.error);
      })
    ]);
  }
}

export default UserFeedbackStore;
|