Spaces:

eduardmtz
/

www

Running

App Files Files Community

www / entrenament-pdf.html

eduardmtz

Update entrenament-pdf.html

e2bdc77 verified 5 months ago

raw

history blame

3.6 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Train Model from PDFs</title>
	<!--
	Library versions are pinned on purpose. An unversioned CDN URL resolves to the
	latest release, and recent pdfjs-dist majors no longer ship the classic
	build/pdf.min.js bundle that defines the global `pdfjsLib` used below.
	-->
	<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@4.22.0/dist/tf.min.js"></script>
	<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.min.js"></script>
	</head>
	<body>
	<h1>Train a Model from PDF Files</h1>
	<a href="entrenament-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Entrenament PDF</a>
	<a href="preguntar-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Preguntar PDF</a>
	<br>
	<input type="file" id="pdfFiles" multiple accept="application/pdf">
	<button id="trainModel">Train Model</button>

	<pre id="output"></pre>

	<script>
	// The <pre id="output"> element; training progress and status messages are appended to its text.
	const outputElement = document.getElementById('output');

	/**
	 * Extracts the plain text of every page of a PDF file using PDF.js.
	 *
	 * @param {File} file - PDF file; only its `arrayBuffer()` method is used.
	 * @returns {Promise<string>} Text of all pages. The text items of one page are
	 *   joined with single spaces and pages are separated by a newline.
	 * @throws Rejects with whatever PDF.js reports when the document cannot be loaded or read.
	 */
	async function extractTextFromPDF(file) {
		const loadingTask = pdfjsLib.getDocument({data: await file.arrayBuffer()});
		try {
			const pdf = await loadingTask.promise;
			const pageTexts = [];
			for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
				const page = await pdf.getPage(pageNumber);
				const content = await page.getTextContent();
				pageTexts.push(content.items.map((item) => item.str).join(' '));
			}
			// Pages are joined with a separator so the last word of one page is
			// not fused with the first word of the next one.
			return pageTexts.join('\n');
		} finally {
			// Release the document and its worker resources, even on failure.
			await loadingTask.destroy();
		}
	}

	/**
	 * Trains a small dense network on the text extracted from the given PDF files.
	 *
	 * Each file becomes one training row: its whitespace-separated tokens are
	 * replaced by integer ids and the row is right-padded with 0 up to the length
	 * of the longest file, so every row has the same width. All labels are the
	 * dummy value 1. Progress is appended to `outputElement`.
	 *
	 * @param {Iterable<File>} files - PDF files to train on (e.g. a FileList).
	 * @returns {Promise<object>} The trained TensorFlow.js sequential model.
	 * @throws {Error} If none of the files yields any text token.
	 */
	async function trainModelFromPDFs(files) {
		const tokenizedTexts = [];
		for (const file of files) {
			const text = await extractTextFromPDF(file);
			// Leading/trailing whitespace makes split() emit '' entries; drop them.
			tokenizedTexts.push(text.split(/\s+/).filter((token) => token !== ''));
		}

		// A Map (not a plain object) keeps tokens such as "__proto__" or
		// "constructor" from colliding with Object.prototype members.
		// Ids start at 1 so that 0 is reserved for padding.
		const tokenToIndex = new Map();
		let sequenceLength = 0;
		for (const tokens of tokenizedTexts) {
			sequenceLength = Math.max(sequenceLength, tokens.length);
			for (const token of tokens) {
				if (!tokenToIndex.has(token)) {
					tokenToIndex.set(token, tokenToIndex.size + 1);
				}
			}
		}
		if (sequenceLength === 0) {
			throw new Error('No text could be extracted from the selected PDF files.');
		}

		// Pad every row to the same width; tensor2d rejects ragged arrays.
		const data = tokenizedTexts.map((tokens) => {
			const row = new Array(sequenceLength).fill(0);
			tokens.forEach((token, position) => {
				row[position] = tokenToIndex.get(token);
			});
			return row;
		});

		// Define and train a simple TensorFlow model
		const model = tf.sequential();
		model.add(tf.layers.dense({units: 128, activation: 'relu', inputShape: [sequenceLength]}));
		model.add(tf.layers.dense({units: 1, activation: 'sigmoid'}));

		model.compile({optimizer: 'adam', loss: 'binaryCrossentropy'});

		const inputs = tf.tensor2d(data, [data.length, sequenceLength]);
		// Labels must be [batch, 1] to match the single-unit output layer.
		const labels = tf.tensor2d(new Array(data.length).fill(1), [data.length, 1]); // Dummy labels for training

		try {
			await model.fit(inputs, labels, {
				epochs: 10,
				callbacks: {
					onEpochEnd: (epoch, logs) => {
						outputElement.textContent += `Epoch ${epoch + 1}: loss = ${logs.loss}\n`;
					}
				}
			});
		} finally {
			// Tensors are not garbage collected; free them explicitly.
			inputs.dispose();
			labels.dispose();
		}

		outputElement.textContent += 'Training complete!';
		return model;
	}

	// Click handler for the "Train Model" button: trains on the selected PDFs,
	// stores the model in localStorage and reports the outcome in the output area.
	document.getElementById('trainModel').addEventListener('click', async (event) => {
		const files = document.getElementById('pdfFiles').files;
		if (files.length === 0) {
			alert('Please select PDF files to train the model.');
			return;
		}

		// Capture the button now: event.currentTarget is reset once dispatch ends,
		// i.e. before the awaits below resume.
		const trainButton = event.currentTarget;
		// Block further clicks so two training runs cannot interleave their output.
		trainButton.disabled = true;
		outputElement.textContent = 'Training model...\n';

		try {
			const model = await trainModelFromPDFs(files);

			// Save the model for later use
			await model.save('localstorage://pdf-trained-model');
			outputElement.textContent += '\nModel saved locally!';
		} catch (err) {
			// Surface failures (unreadable PDF, training error, storage quota)
			// instead of leaving the page stuck on "Training model...".
			console.error(err);
			const reason = err instanceof Error ? err.message : String(err);
			outputElement.textContent += `\nError: ${reason}`;
		} finally {
			trainButton.disabled = false;
		}
	});
	</script>
	</body>
	</html>