Spaces:
Running
Running
Update entrenament-pdf.html
Browse files- entrenament-pdf.html +53 -48
entrenament-pdf.html
CHANGED
@@ -8,86 +8,91 @@
|
|
8 |
<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist/build/pdf.min.js"></script>
|
9 |
</head>
|
10 |
<body>
|
11 |
-
<h1>Train
|
12 |
-
<
|
13 |
-
<a href="preguntar-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Preguntar PDF</a>
|
14 |
-
<br><br>
|
15 |
-
<input type="file" id="pdfFiles" multiple accept="application/pdf">
|
16 |
<button id="trainModel">Train Model</button>
|
17 |
-
|
18 |
-
<pre id="output"></pre>
|
19 |
|
20 |
<script>
|
21 |
-
const outputElement = document.getElementById('output');
|
22 |
-
|
23 |
async function extractTextFromPDF(file) {
|
24 |
const pdf = await pdfjsLib.getDocument(await file.arrayBuffer()).promise;
|
25 |
-
let
|
26 |
for (let i = 1; i <= pdf.numPages; i++) {
|
27 |
const page = await pdf.getPage(i);
|
28 |
-
const
|
29 |
-
|
30 |
}
|
31 |
-
return
|
32 |
}
|
33 |
|
34 |
-
async function
|
35 |
-
const
|
36 |
-
for (const file of files) {
|
37 |
-
const text = await extractTextFromPDF(file);
|
38 |
-
texts.push(text);
|
39 |
-
}
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
tokenToIndex[token] = index++;
|
47 |
-
}
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
return tokens.map(token => tokenToIndex[token] || 0);
|
52 |
-
});
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
58 |
|
59 |
-
|
|
|
60 |
|
61 |
-
|
62 |
-
const labels = tf.tensor1d(new Array(data.length).fill(1)); // Dummy labels for training
|
63 |
|
64 |
await model.fit(inputs, labels, {
|
65 |
epochs: 10,
|
66 |
callbacks: {
|
67 |
onEpochEnd: (epoch, logs) => {
|
68 |
-
|
|
|
69 |
}
|
70 |
}
|
71 |
});
|
72 |
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
}
|
76 |
|
77 |
document.getElementById('trainModel').addEventListener('click', async () => {
|
78 |
-
const files = document.getElementById('
|
79 |
-
if (files.length
|
80 |
-
|
81 |
return;
|
82 |
}
|
83 |
|
84 |
-
|
85 |
-
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
});
|
91 |
</script>
|
92 |
</body>
|
93 |
</html>
|
|
|
|
8 |
<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist/build/pdf.min.js"></script>
|
9 |
</head>
|
10 |
<body>
|
11 |
+
<h1>Train Model from PDFs</h1>
|
12 |
+
<input type="file" id="fileInput" multiple>
|
|
|
|
|
|
|
13 |
<button id="trainModel">Train Model</button>
|
14 |
+
<pre id="status"></pre>
|
|
|
15 |
|
16 |
<script>
|
|
|
|
|
17 |
/**
 * Extracts the plain text of every page of a PDF using PDF.js.
 *
 * The text items of each page are joined with single spaces, and each page's
 * text is followed by one trailing space, so the result of a non-empty
 * document always ends with a space.
 *
 * @param {Blob} file - PDF file (e.g. from an `<input type="file">`); only
 *   its `arrayBuffer()` method is used.
 * @returns {Promise<string>} Concatenated text of all pages.
 * @throws Propagates any error raised by PDF.js while loading or reading.
 */
async function extractTextFromPDF(file) {
  const loadingTask = pdfjsLib.getDocument({ data: await file.arrayBuffer() });
  try {
    const pdf = await loadingTask.promise;
    let text = '';
    // PDF.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const content = await page.getTextContent();
      text += `${content.items.map((item) => item.str).join(' ')} `;
    }
    return text;
  } finally {
    // Release the document and its worker-side resources even when
    // extraction fails; otherwise every processed file stays in memory.
    await loadingTask.destroy();
  }
}
|
27 |
|
28 |
+
/**
 * Builds, trains and persists a small binary classifier with TensorFlow.js.
 *
 * The network is dense(128, relu) -> dense(64, relu) -> dense(1, sigmoid),
 * trained for 10 epochs with Adam and binary cross-entropy, then saved to
 * IndexedDB under `indexeddb://pdf-trained-model`. Progress and save errors
 * are reported through the `#status` element.
 *
 * @param {Array<{input: number[], label: number}>} data - Training samples;
 *   every `input` must be non-empty and have the same length.
 * @returns {Promise<void>} Resolves once training and the save attempt finish.
 * @throws {Error} If the samples do not share one non-zero feature count, or
 *   if training itself fails.
 */
async function trainModel(data) {
  const statusElement = document.getElementById('status');

  if (!Array.isArray(data) || data.length === 0) {
    statusElement.textContent = 'No training data available.';
    return;
  }

  // The feature count comes from the sample's `input` array; `data[0]` itself
  // is a plain object and has no `length`.
  const featureCount = data[0].input.length;
  const hasUniformShape = data.every(
    (sample) => sample.input.length === featureCount
  );
  if (featureCount === 0 || !hasUniformShape) {
    throw new Error(
      'All training samples must have the same non-zero number of input features.'
    );
  }

  const model = tf.sequential();

  model.add(tf.layers.dense({
    units: 128,
    activation: 'relu',
    inputShape: [featureCount]
  }));

  model.add(tf.layers.dense({ units: 64, activation: 'relu' }));
  model.add(tf.layers.dense({ units: 1, activation: 'sigmoid' }));

  model.compile({
    optimizer: 'adam',
    loss: 'binaryCrossentropy',
    metrics: ['accuracy']
  });

  const inputs = tf.tensor2d(
    data.map((sample) => sample.input),
    [data.length, featureCount]
  );
  // The final layer outputs [batch, 1], so the targets must be 2-D as well.
  const labels = tf.tensor2d(
    data.map((sample) => sample.label),
    [data.length, 1]
  );

  try {
    statusElement.textContent = 'Training the model...';

    await model.fit(inputs, labels, {
      epochs: 10,
      callbacks: {
        onEpochEnd: (epoch, logs) => {
          console.log(`Epoch ${epoch}: loss = ${logs.loss}`);
          statusElement.textContent = `Epoch ${epoch + 1}: Loss = ${logs.loss}`;
        }
      }
    });

    statusElement.textContent = 'Saving the model to IndexedDB...';

    try {
      await model.save('indexeddb://pdf-trained-model');
      statusElement.textContent = 'Model saved successfully in IndexedDB!';
    } catch (err) {
      statusElement.textContent = 'Error saving the model: ' + err.message;
      console.error('Error saving the model:', err);
    }
  } finally {
    // Tensors and model weights are not garbage-collected by TensorFlow.js;
    // free them explicitly so repeated training runs do not leak memory.
    inputs.dispose();
    labels.dispose();
    model.dispose();
  }
}
|
71 |
|
72 |
// Number of word-length features fed to the model for each PDF.
const FEATURE_COUNT = 10;

/**
 * Turns extracted text into a fixed-size numeric feature vector: the lengths
 * of the first `featureCount` words, zero-padded when the text is shorter.
 *
 * @param {string} text - Text extracted from one PDF.
 * @param {number} [featureCount=FEATURE_COUNT] - Length of the result.
 * @returns {number[]} Exactly `featureCount` numbers.
 */
function buildFeatureVector(text, featureCount = FEATURE_COUNT) {
  const features = text
    .split(/\s+/)
    // Leading/trailing whitespace produces empty tokens; they are not words.
    .filter((word) => word.length > 0)
    .slice(0, featureCount)
    .map((word) => word.length);

  // Every sample must have the same width or the 2-D input tensor cannot be
  // built, so short documents are padded with zeros.
  while (features.length < featureCount) {
    features.push(0);
  }
  return features;
}

// Click handler: extract text from the selected PDFs, build one sample per
// file, and train the model. Progress and failures are shown in `#status`.
document.getElementById('trainModel').addEventListener('click', async () => {
  const statusElement = document.getElementById('status');
  const files = document.getElementById('fileInput').files;
  if (!files.length) {
    statusElement.textContent = 'Please select PDF files to train the model.';
    return;
  }

  try {
    statusElement.textContent = 'Extracting text from PDFs...';

    const data = [];
    for (const file of files) {
      const text = await extractTextFromPDF(file);
      data.push({
        input: buildFeatureVector(text), // Example: word lengths of the first words as features
        label: 1 // Example label (adjust as needed for your use case)
      });
    }

    statusElement.textContent = 'Training the model...';
    await trainModel(data);
  } catch (err) {
    // Without this, a corrupt or non-PDF file leaves the status stuck on
    // "Extracting text..." and the rejection goes unhandled.
    statusElement.textContent = `Error training the model: ${err.message}`;
    console.error('Error training the model:', err);
  }
});
|
95 |
</script>
|
96 |
</body>
|
97 |
</html>
|
98 |
+
|