www / entrenament-pdf.html
eduardmtz's picture
Update entrenament-pdf.html
e2bdc77 verified
raw
history blame
3.6 kB
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Train Model from PDFs</title>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist/build/pdf.min.js"></script>
</head>
<body>
<h1>Train a Model from PDF Files</h1>
<a href="entrenament-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Entrenament PDF</a>
<a href="preguntar-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Preguntar PDF</a>
<br>
<input type="file" id="pdfFiles" multiple accept="application/pdf">
<button id="trainModel">Train Model</button>
<pre id="output"></pre>
<script>
// The <pre id="output"> element; the script writes training progress and status messages into it.
const outputElement = document.getElementById('output');
/**
 * Extracts the plain text of every page of a PDF using pdf.js.
 *
 * @param {{arrayBuffer: function(): Promise<ArrayBuffer>}} file - Object whose
 *   `arrayBuffer()` resolves to the PDF bytes (e.g. an entry of an
 *   `<input type="file">` file list).
 * @returns {Promise<string>} The text of all pages. Text items within a page
 *   are separated by a space, pages are separated by a newline.
 */
async function extractTextFromPDF(file) {
  const pdf = await pdfjsLib.getDocument(await file.arrayBuffer()).promise;
  const pageTexts = [];
  for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
    const page = await pdf.getPage(pageNumber);
    const content = await page.getTextContent();
    pageTexts.push(content.items.map((item) => item.str).join(' '));
  }
  // Pages must be separated explicitly: appending them back to back would
  // fuse the last word of one page with the first word of the next one.
  return pageTexts.join('\n');
}
/**
 * Splits text into whitespace-separated tokens.
 * Empty strings (produced by leading/trailing whitespace or empty text) are dropped.
 */
function tokenizeText(text) {
  return text.split(/\s+/).filter((token) => token !== '');
}

/**
 * Assigns an index to every distinct token, in first-seen order, starting at 1.
 * Index 0 is left unused so it can serve as the padding value. A Map is used
 * instead of a plain object so tokens such as "__proto__" are ordinary keys.
 */
function buildVocabulary(tokenLists) {
  const vocabulary = new Map();
  for (const tokens of tokenLists) {
    for (const token of tokens) {
      if (!vocabulary.has(token)) {
        vocabulary.set(token, vocabulary.size + 1);
      }
    }
  }
  return vocabulary;
}

/**
 * Converts each token list into a row of vocabulary indices, right-padded
 * with 0 up to `length` so that all rows have the same width.
 */
function encodeTokenLists(tokenLists, vocabulary, length) {
  return tokenLists.map((tokens) => {
    const row = new Array(length).fill(0);
    tokens.forEach((token, position) => {
      row[position] = vocabulary.get(token);
    });
    return row;
  });
}

/**
 * Extracts the text of the given PDF files, encodes it as token indices and
 * fits a small dense TensorFlow.js model on it (with dummy labels).
 * Per-epoch loss and a completion message are appended to `outputElement`.
 *
 * @param {Iterable} files - PDF files, each accepted by `extractTextFromPDF`.
 * @returns {Promise<Object>} The trained `tf.sequential()` model.
 * @throws {Error} If no text token could be extracted from any file.
 */
async function trainModelFromPDFs(files) {
  const tokenLists = [];
  for (const file of files) {
    tokenLists.push(tokenizeText(await extractTextFromPDF(file)));
  }

  // Rows are padded to the longest document: a 2D tensor must be rectangular,
  // and the model's input width has to fit every document, not just the first.
  const sequenceLength = tokenLists.reduce(
    (longest, tokens) => Math.max(longest, tokens.length),
    0
  );
  if (sequenceLength === 0) {
    throw new Error('No text could be extracted from the selected PDF files.');
  }

  // Tokenize text data and prepare features/labels
  const vocabulary = buildVocabulary(tokenLists);
  const data = encodeTokenLists(tokenLists, vocabulary, sequenceLength);

  // Define and train a simple TensorFlow model
  const model = tf.sequential();
  model.add(tf.layers.dense({units: 128, activation: 'relu', inputShape: [sequenceLength]}));
  model.add(tf.layers.dense({units: 1, activation: 'sigmoid'}));
  model.compile({optimizer: 'adam', loss: 'binaryCrossentropy'});

  const inputs = tf.tensor2d(data);
  const labels = tf.tensor1d(new Array(data.length).fill(1)); // Dummy labels for training
  try {
    await model.fit(inputs, labels, {
      epochs: 10,
      callbacks: {
        onEpochEnd: (epoch, logs) => {
          outputElement.textContent += `Epoch ${epoch + 1}: loss = ${logs.loss}\n`;
        }
      }
    });
  } finally {
    // Tensors are not garbage collected; release them even if fitting fails.
    inputs.dispose();
    labels.dispose();
  }
  outputElement.textContent += 'Training complete!';
  return model;
}
// Click handler for the "Train Model" button: trains on the selected PDFs and
// saves the resulting model to localStorage under the key "pdf-trained-model".
document.getElementById('trainModel').addEventListener('click', async (event) => {
  const files = document.getElementById('pdfFiles').files;
  if (files.length === 0) {
    alert('Please select PDF files to train the model.');
    return;
  }
  // Read the button synchronously: event.currentTarget is cleared once
  // dispatch finishes, i.e. after the first await.
  const trainButton = event.currentTarget;
  // Block further clicks so two training runs cannot overlap and interleave
  // their output.
  trainButton.disabled = true;
  outputElement.textContent = 'Training model...\n';
  try {
    const model = await trainModelFromPDFs(files);
    // Save the model for later use
    await model.save('localstorage://pdf-trained-model');
    outputElement.textContent += '\nModel saved locally!';
  } catch (error) {
    // Surface failures (unreadable PDF, training error, storage quota exceeded)
    // instead of leaving the page stuck on "Training model...".
    console.error(error);
    const reason = error && error.message ? error.message : String(error);
    outputElement.textContent += `\nError: ${reason}`;
  } finally {
    trainButton.disabled = false;
  }
});
</script>
</body>
</html>