import {
  AutoProcessor,
  AutoTokenizer,
  ClapAudioModelWithProjection,
  ClapTextModelWithProjection
} from '@xenova/transformers';

class CLAPProcessor {
  constructor() {
    this.audioModel = null;
    this.textModel = null;
    this.processor = null;
    this.tokenizer = null;
    this.defaultLabels = [
      'speech', 'music', 'singing', 'guitar', 'piano', 'drums', 'violin',
      'trumpet', 'saxophone', 'flute', 'classical music', 'rock music',
      'pop music', 'jazz', 'electronic music', 'ambient', 'nature sounds',
      'rain', 'wind', 'ocean waves', 'birds chirping', 'dog barking',
      'cat meowing', 'car engine', 'traffic', 'footsteps', 'door closing',
      'applause', 'laughter', 'crying', 'coughing', 'sneezing',
      'telephone ringing', 'alarm clock', 'typing', 'water running',
      'fire crackling', 'thunder', 'helicopter', 'airplane', 'train',
      'motorcycle', 'bell ringing', 'whistle', 'horn', 'siren',
      'explosion', 'gunshot', 'silence', 'noise', 'distortion'
    ];
  }

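  // Lazily loads the shared processor/tokenizer and the audio and text
  // projection models. Weights are downloaded from the Hugging Face Hub on
  // first use, so the initial call may take a while.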
  async initialize() {
    if (this.audioModel && this.textModel) return;

    try {
      this.processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused');
      this.tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused');
      this.audioModel = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');
      this.textModel = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused');

      console.log('CLAP models loaded successfully');
    } catch (error) {
      console.error('Failed to load CLAP models:', error);
      throw error;
    }
  }

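  // Zero-shot tagging: embeds the audio clip and every candidate label into
  // CLAP's shared embedding space, then ranks labels by cosine similarity.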
  async processAudio(audioBuffer) {
    if (!this.audioModel || !this.textModel) {
      await this.initialize();
    }

    try {
      // Downmix and resample to the mono 48 kHz signal CLAP expects.
      const audio = await this.preprocessAudio(audioBuffer);

      // Embed the audio clip; the model returns `audio_embeds` of shape
      // [1, embeddingDim].
      const audioInputs = await this.processor(audio);
      const { audio_embeds } = await this.audioModel(audioInputs);

      // Embed all candidate labels; `text_embeds` has shape
      // [numLabels, embeddingDim].
      const textInputs = this.tokenizer(this.defaultLabels, { padding: true, truncation: true });
      const { text_embeds } = await this.textModel(textInputs);

      const similarities = this.calculateSimilarities(audio_embeds, text_embeds);

      return this.getTopTags(similarities, 5);
    } catch (error) {
      console.error('Error processing audio:', error);
      throw error;
    }
  }

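  // Converts a decoded AudioBuffer into the mono 48 kHz Float32Array the CLAP
  // feature extractor expects.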
  async preprocessAudio(audioBuffer) {
    // Average all channels down to mono.
    let audioData;
    if (audioBuffer.numberOfChannels > 1) {
      audioData = new Float32Array(audioBuffer.length);
      for (let i = 0; i < audioBuffer.length; i++) {
        let sum = 0;
        for (let channel = 0; channel < audioBuffer.numberOfChannels; channel++) {
          sum += audioBuffer.getChannelData(channel)[i];
        }
        audioData[i] = sum / audioBuffer.numberOfChannels;
      }
    } else {
      audioData = audioBuffer.getChannelData(0);
    }

    // clap-htsat-unfused expects 48 kHz input; resample if the source differs.
    const targetSampleRate = 48000;
    if (audioBuffer.sampleRate !== targetSampleRate) {
      audioData = await this.resampleAudio(audioData, audioBuffer.sampleRate, targetSampleRate);
    }

    return audioData;
  }

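  // Naive linear-interpolation resampler. Adequate for tagging; for
  // higher-quality resampling an OfflineAudioContext could be used instead.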
  async resampleAudio(audioData, originalRate, targetRate) {
    const ratio = originalRate / targetRate;
    const newLength = Math.round(audioData.length / ratio);
    const resampled = new Float32Array(newLength);

    for (let i = 0; i < newLength; i++) {
      const originalIndex = i * ratio;
      const indexFloor = Math.floor(originalIndex);
      const indexCeil = Math.min(indexFloor + 1, audioData.length - 1);
      const fraction = originalIndex - indexFloor;

      resampled[i] = audioData[indexFloor] * (1 - fraction) + audioData[indexCeil] * fraction;
    }

    return resampled;
  }

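  // Compares the audio embedding against each label's text embedding. Both
  // tensors expose a flat `data` Float32Array; `text_embeds` is row-major
  // [numLabels, embeddingDim], so each label's vector is a contiguous slice.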
  calculateSimilarities(audioFeatures, textFeatures) {
    const audioVector = audioFeatures.data;
    const similarities = [];

    for (let i = 0; i < this.defaultLabels.length; i++) {
      const textVector = textFeatures.data.slice(
        i * audioVector.length,
        (i + 1) * audioVector.length
      );

      similarities.push(this.cosineSimilarity(audioVector, textVector));
    }

    return similarities;
  }

  cosineSimilarity(vecA, vecB) {
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < vecA.length; i++) {
      dotProduct += vecA[i] * vecB[i];
      normA += vecA[i] * vecA[i];
      normB += vecB[i] * vecB[i];
    }

    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
  }

  getTopTags(similarities, topK = 5) {
    const tagged = this.defaultLabels.map((label, index) => ({
      label,
      // Cosine similarity can be negative; clamp so confidence stays in [0, 1].
      confidence: Math.max(0, similarities[index])
    }));

    return tagged
      .sort((a, b) => b.confidence - a.confidence)
      .slice(0, topK);
  }

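  // Decodes an audio File or Blob with the Web Audio API (browser-only; in
  // Node.js a separate audio-decoding library would be needed).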
  async fileToAudioBuffer(file) {
    const arrayBuffer = await file.arrayBuffer();
    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
    try {
      return await audioContext.decodeAudioData(arrayBuffer);
    } finally {
      // Release the context; browsers cap the number of live AudioContexts.
      await audioContext.close();
    }
  }
}

export default CLAPProcessor;
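
// Example usage (a minimal sketch; assumes a browser context where `file` is a
// File obtained from e.g. an <input type="file"> element):
//
//   const clap = new CLAPProcessor();
//   const audioBuffer = await clap.fileToAudioBuffer(file);
//   const tags = await clap.processAudio(audioBuffer);
//   // tags: [{ label, confidence }, ...], the top five labels by similarity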