<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Train Model from PDFs</title>
  <style>
    /* Boxed look for the page-to-page navigation links. */
    .page-nav a {
      margin: 5px;
      padding: 5px;
      border: 1px solid green;
    }
  </style>
  <!--
    Library versions are pinned on purpose: an unversioned jsDelivr URL always
    resolves to the latest release, and newer pdfjs-dist majors no longer ship
    build/pdf.min.js, which would leave `pdfjsLib` undefined.
  -->
  <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@4.22.0/dist/tf.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.min.js"></script>
  <script>
    // pdf.js parses documents in a worker; point it at the worker build that
    // matches the pinned library version.
    if (window.pdfjsLib) {
      pdfjsLib.GlobalWorkerOptions.workerSrc =
        'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.worker.min.js';
    }
  </script>
</head>
<body>
  <h1>Train a Model from PDF Files</h1>
  <nav class="page-nav" aria-label="Pages">
    <a href="entrenament-pdf.html" lang="ca">Entrenament PDF</a>
    <a href="preguntar-pdf.html" lang="ca">Preguntar PDF</a>
  </nav>
  <main>
    <p>
      <label for="pdfFiles">PDF files to train on</label>
      <input type="file" id="pdfFiles" multiple accept="application/pdf">
      <button type="button" id="trainModel">Train Model</button>
    </p>
    <!-- Training progress is appended here by the script below. -->
    <pre id="output" aria-live="polite"></pre>
  </main>
  <script>
// Status area (the element with id "output") that receives training progress
// and result messages from the functions below.
const outputElement = document.getElementById('output');
/**
 * Extracts the plain text of every page of a PDF using pdf.js.
 *
 * Text items within a page are joined with a single space, and non-empty
 * pages are joined with a single space as well, so the last word of one page
 * is never fused with the first word of the next.
 *
 * @param {File|Blob} file - Object whose `arrayBuffer()` resolves to the raw PDF bytes.
 * @returns {Promise<string>} Concatenated text of all pages, in page order.
 * @throws Rejects with whatever pdf.js reports when the bytes cannot be parsed.
 */
async function extractTextFromPDF(file) {
  const data = new Uint8Array(await file.arrayBuffer());
  const loadingTask = pdfjsLib.getDocument({ data });
  try {
    const pdf = await loadingTask.promise;
    const pageTexts = [];
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const content = await page.getTextContent();
      pageTexts.push(content.items.map(item => item.str).join(' '));
    }
    // Skip pages without text so they do not add stray separators.
    return pageTexts.filter(pageText => pageText.length > 0).join(' ');
  } finally {
    // Release the document and its worker even when parsing fails.
    await loadingTask.destroy();
  }
}
/**
 * Builds a whitespace-token vocabulary from the given PDFs and trains a small
 * dense TensorFlow.js model on the resulting index sequences.
 *
 * Each PDF becomes one training row of token indices. Rows are right-padded
 * with 0 to the length of the longest PDF so they form a rectangular tensor.
 * Progress is appended to `outputElement` after every epoch.
 *
 * NOTE(review): index 0 is both the padding value and the index of the first
 * vocabulary token, and every label is the constant 1, so the model has
 * nothing meaningful to learn yet. This is kept as in the original
 * placeholder design.
 *
 * @param {Iterable<File>} files - PDF files to train on (e.g. a FileList).
 * @returns {Promise<tf.Sequential>} The trained model; the caller owns it and
 *   should dispose it when done.
 * @throws {Error} If `files` is empty, or if text extraction or training fails.
 */
async function trainModelFromPDFs(files) {
  const texts = [];
  for (const file of files) {
    texts.push(await extractTextFromPDF(file));
  }
  if (texts.length === 0) {
    throw new Error('No PDF files were provided for training.');
  }

  // Vocabulary in first-seen order. A Map (not a plain object) so tokens such
  // as "__proto__" or "constructor" cannot resolve to Object.prototype members.
  const tokenToIndex = new Map();
  for (const token of texts.join(' ').split(/\s+/)) {
    if (!tokenToIndex.has(token)) {
      tokenToIndex.set(token, tokenToIndex.size);
    }
  }

  // One row of token indices per PDF; unknown tokens fall back to index 0.
  const sequences = texts.map(text =>
    text.split(/\s+/).map(token => (tokenToIndex.has(token) ? tokenToIndex.get(token) : 0))
  );

  // PDFs differ in length, but tf.tensor2d needs equal-length rows.
  const inputLength = sequences.reduce((longest, row) => Math.max(longest, row.length), 0);
  const data = sequences.map(row => row.concat(new Array(inputLength - row.length).fill(0)));

  // Define and train a simple TensorFlow model.
  const model = tf.sequential();
  model.add(tf.layers.dense({units: 128, activation: 'relu', inputShape: [inputLength]}));
  model.add(tf.layers.dense({units: 1, activation: 'sigmoid'}));
  model.compile({optimizer: 'adam', loss: 'binaryCrossentropy'});

  const inputs = tf.tensor2d(data, [data.length, inputLength]);
  // Dummy labels for training, shaped [batch, 1] to match the model output.
  const labels = tf.tensor2d(new Array(data.length).fill(1), [data.length, 1]);
  try {
    await model.fit(inputs, labels, {
      epochs: 10,
      callbacks: {
        onEpochEnd: (epoch, logs) => {
          outputElement.textContent += `Epoch ${epoch + 1}: loss = ${logs.loss}\n`;
        }
      }
    });
  } catch (error) {
    // The model is not handed to the caller on failure, so free its weights here.
    model.dispose();
    throw error;
  } finally {
    inputs.dispose();
    labels.dispose();
  }
  outputElement.textContent += 'Training complete!';
  return model;
}
// Click handler for the "Train Model" button: trains on the selected PDFs,
// saves the model to localStorage, and reports progress or failure in the
// output area. The button is disabled while a run is in progress so a second
// click cannot start a concurrent training session.
document.getElementById('trainModel').addEventListener('click', async (event) => {
  // Read currentTarget before the first await; it is reset once dispatch ends.
  const trainButton = event.currentTarget;
  const files = document.getElementById('pdfFiles').files;
  if (files.length === 0) {
    alert('Please select PDF files to train the model.');
    return;
  }
  trainButton.disabled = true;
  outputElement.textContent = 'Training model...\n';
  try {
    const model = await trainModelFromPDFs(files);
    try {
      // Save the model for later use.
      // NOTE(review): localStorage quotas are small; a model with a large
      // input layer may fail to save. That failure is reported below.
      await model.save('localstorage://pdf-trained-model');
      outputElement.textContent += '\nModel saved locally!';
    } finally {
      // The model is not used after saving, so release its weights.
      model.dispose();
    }
  } catch (error) {
    console.error(error);
    const reason = error && error.message ? error.message : String(error);
    outputElement.textContent += `\nTraining failed: ${reason}`;
  } finally {
    trainButton.disabled = false;
  }
});
  </script>
</body>
</html>