Spaces:

eduardmtz
/

www

Running

App Files Community

eduardmtz committed on Dec 18, 2024

Commit

9849e16

verified ·

1 Parent(s): 632e088

Update entrenament-pdf.html

Browse files

Files changed (1) hide show

entrenament-pdf.html +53 -48

entrenament-pdf.html CHANGED Viewed

@@ -8,86 +8,91 @@
     <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist/build/pdf.min.js"></script>
 </head>
 <body>
-    <h1>Train a Model from PDF Files</h1>
-    <a href="entrenament-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Entrenament PDF</a>
-    <a href="preguntar-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Preguntar PDF</a>
-    <br><br>
-    <input type="file" id="pdfFiles" multiple accept="application/pdf">
     <button id="trainModel">Train Model</button>
-    <pre id="output"></pre>
     <script>
-        const outputElement = document.getElementById('output');
         // Old side of the diff: loads the PDF with pdfjsLib and returns the text of all its pages as one string.
         async function extractTextFromPDF(file) {
             const pdf = await pdfjsLib.getDocument(await file.arrayBuffer()).promise;
-            let textContent = ''; // Accumulates the text of every page.
             for (let i = 1; i <= pdf.numPages; i++) {
                 const page = await pdf.getPage(i);
-                const text = await page.getTextContent();
-                textContent += text.items.map(item => item.str).join(' '); // Items within a page are space-joined; nothing is inserted between one page and the next.
             }
-            return textContent;
         }
-        async function trainModelFromPDFs(files) { // Removed version: extracts text from each file, encodes it as token indices, fits a two-layer dense model and returns it.
-            const texts = [];
-            for (const file of files) {
-                const text = await extractTextFromPDF(file);
-                texts.push(text);
-            }
-            // Tokenize text data and prepare features/labels
-            const tokenizer = new Set(texts.join(' ').split(/\s+/)); // Unique whitespace-separated tokens across all files.
-            const tokenToIndex = {};
-            let index = 0;
-            for (const token of tokenizer) {
-                tokenToIndex[token] = index++; // Indices are assigned in Set iteration order, starting at 0.
-            }
-            const data = texts.map(text => {
-                const tokens = text.split(/\s+/);
-                return tokens.map(token => tokenToIndex[token] || 0); // Missing tokens fall back to 0, which is also the index of the first token.
-            });
-            // Define and train a simple TensorFlow model
-            const model = tf.sequential();
-            model.add(tf.layers.dense({units: 128, activation: 'relu', inputShape: [data[0].length]})); // Input width is taken from the token count of the first file only.
-            model.add(tf.layers.dense({units: 1, activation: 'sigmoid'}));
-            model.compile({optimizer: 'adam', loss: 'binaryCrossentropy'});
-            const inputs = tf.tensor2d(data); // Each row has one entry per token of its file, so row lengths differ between files. NOTE(review): tf.tensor2d presumably needs equal-length rows - confirm.
-            const labels = tf.tensor1d(new Array(data.length).fill(1)); // Dummy labels for training: every sample is labelled 1.
             await model.fit(inputs, labels, {
                 epochs: 10,
                 callbacks: {
                     onEpochEnd: (epoch, logs) => {
-                        outputElement.textContent += `Epoch ${epoch + 1}: loss = ${logs.loss}\n`; // Appends one line per epoch; the epoch is displayed 1-based.
                     }
                 }
             });
-            outputElement.textContent += 'Training complete!';
-            return model; // The caller is responsible for saving the trained model.
         }
         // Old side of the diff: click handler that trains on the selected files and stores the model in localStorage.
         document.getElementById('trainModel').addEventListener('click', async () => {
-            const files = document.getElementById('pdfFiles').files; // Files chosen in the #pdfFiles input.
-            if (files.length === 0) {
-                alert('Please select PDF files to train the model.');
                 return;
             }
-            outputElement.textContent = 'Training model...\n'; // Replaces any output left from a previous run.
-            const model = await trainModelFromPDFs(files);
-            // Save the model for later use
-            await model.save('localstorage://pdf-trained-model'); // Stored under the key "pdf-trained-model".
-            outputElement.textContent += '\nModel saved locally!';
         });
     </script>
 </body>
 </html>

     <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist/build/pdf.min.js"></script>
 </head>
 <body>
+    <h1>Train Model from PDFs</h1>
+    <input type="file" id="fileInput" multiple>
     <button id="trainModel">Train Model</button>
+    <pre id="status"></pre>
     <script>
         async function extractTextFromPDF(file) {
             const pdf = await pdfjsLib.getDocument(await file.arrayBuffer()).promise;
+            let text = '';
             for (let i = 1; i <= pdf.numPages; i++) {
                 const page = await pdf.getPage(i);
+                const content = await page.getTextContent();
+                text += content.items.map(item => item.str).join(' ') + ' ';
             }
+            return text;
         }
+        async function trainModel(data) {
+            const model = tf.sequential();
+            model.add(tf.layers.dense({
+                units: 128,
+                activation: 'relu',
+                inputShape: [data[0].length]
+            }));
+            model.add(tf.layers.dense({ units: 64, activation: 'relu' }));
+            model.add(tf.layers.dense({ units: 1, activation: 'sigmoid' }));
+            model.compile({
+                optimizer: 'adam',
+                loss: 'binaryCrossentropy',
+                metrics: ['accuracy']
+            });
+            const inputs = tf.tensor2d(data.map(d => d.input));
+            const labels = tf.tensor1d(data.map(d => d.label));
+            document.getElementById('status').textContent = 'Training the model...';
             await model.fit(inputs, labels, {
                 epochs: 10,
                 callbacks: {
                     onEpochEnd: (epoch, logs) => {
+                        console.log(`Epoch ${epoch}: loss = ${logs.loss}`);
+                        document.getElementById('status').textContent = `Epoch ${epoch + 1}: Loss = ${logs.loss}`;
                     }
                 }
             });
+            document.getElementById('status').textContent = 'Saving the model to IndexedDB...';
+            try {
+                await model.save('indexeddb://pdf-trained-model');
+                document.getElementById('status').textContent = 'Model saved successfully in IndexedDB!';
+            } catch (err) {
+                document.getElementById('status').textContent = 'Error saving the model: ' + err.message;
+                console.error('Error saving the model:', err);
+            }
         }
         document.getElementById('trainModel').addEventListener('click', async () => {
+            const files = document.getElementById('fileInput').files;
+            if (!files.length) {
+                document.getElementById('status').textContent = 'Please select PDF files to train the model.';
                 return;
             }
+            const data = [];
+            document.getElementById('status').textContent = 'Extracting text from PDFs...';
+            for (const file of files) {
+                const text = await extractTextFromPDF(file);
+                const tokens = text.split(/\s+/).map(word => word.length); // Example: using word lengths as features
+                data.push({
+                    input: tokens.slice(0, 10), // Use the first 10 tokens as input features
+                    label: 1 // Example label (adjust as needed for your use case)
+                });
+            }
+            document.getElementById('status').textContent = 'Training the model...';
+            await trainModel(data);
         });
     </script>
 </body>
 </html>