eduardmtz committed on
Commit
9849e16
·
verified ·
1 Parent(s): 632e088

Update entrenament-pdf.html

Browse files
Files changed (1) hide show
  1. entrenament-pdf.html +53 -48
entrenament-pdf.html CHANGED
@@ -8,86 +8,91 @@
8
  <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist/build/pdf.min.js"></script>
9
  </head>
10
  <body>
11
- <h1>Train a Model from PDF Files</h1>
12
- <a href="entrenament-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Entrenament PDF</a>
13
- <a href="preguntar-pdf.html" style="margin:5px;padding: 5px; border:1px solid green">Preguntar PDF</a>
14
- <br><br>
15
- <input type="file" id="pdfFiles" multiple accept="application/pdf">
16
  <button id="trainModel">Train Model</button>
17
-
18
- <pre id="output"></pre>
19
 
20
  <script>
21
- const outputElement = document.getElementById('output');
22
-
23
  async function extractTextFromPDF(file) {
24
  const pdf = await pdfjsLib.getDocument(await file.arrayBuffer()).promise;
25
- let textContent = '';
26
  for (let i = 1; i <= pdf.numPages; i++) {
27
  const page = await pdf.getPage(i);
28
- const text = await page.getTextContent();
29
- textContent += text.items.map(item => item.str).join(' ');
30
  }
31
- return textContent;
32
  }
33
 
34
/**
 * Extracts the text of each PDF, encodes it as a row of token indices and
 * trains a small dense network on those rows.
 *
 * Progress messages are appended to the page-level `outputElement`.
 *
 * @param {Iterable<Blob>} files - PDF files to train on (at least one).
 * @returns {Promise<object>} The trained TensorFlow.js sequential model.
 * @throws {Error} If `files` yields no documents.
 */
async function trainModelFromPDFs(files) {
  const texts = [];
  for (const file of files) {
    texts.push(await extractTextFromPDF(file));
  }
  if (texts.length === 0) {
    throw new Error('No PDF files to train on.');
  }

  // Tokenize each document once; filter(Boolean) drops the empty tokens that
  // split() yields for leading/trailing whitespace.
  const tokenized = texts.map(text => text.split(/\s+/).filter(Boolean));

  // A Map (not a plain object) keeps tokens such as "__proto__" or
  // "constructor" from colliding with Object.prototype members.
  // Indices start at 1 so that 0 stays reserved for padding.
  const tokenToIndex = new Map();
  for (const tokens of tokenized) {
    for (const token of tokens) {
      if (!tokenToIndex.has(token)) {
        tokenToIndex.set(token, tokenToIndex.size + 1);
      }
    }
  }

  // tf.tensor2d needs rectangular data, so zero-pad every document to the
  // length of the longest one.
  const featureLength = Math.max(1, ...tokenized.map(tokens => tokens.length));
  const data = tokenized.map(tokens => {
    const row = new Array(featureLength).fill(0);
    tokens.forEach((token, position) => {
      row[position] = tokenToIndex.get(token);
    });
    return row;
  });

  // Define and train a simple TensorFlow model
  const model = tf.sequential();
  model.add(tf.layers.dense({units: 128, activation: 'relu', inputShape: [featureLength]}));
  model.add(tf.layers.dense({units: 1, activation: 'sigmoid'}));
  model.compile({optimizer: 'adam', loss: 'binaryCrossentropy'});

  const inputs = tf.tensor2d(data);
  const labels = tf.tensor1d(new Array(data.length).fill(1)); // Dummy labels for training

  try {
    await model.fit(inputs, labels, {
      epochs: 10,
      callbacks: {
        onEpochEnd: (epoch, logs) => {
          outputElement.textContent += `Epoch ${epoch + 1}: loss = ${logs.loss}\n`;
        }
      }
    });
  } finally {
    // Tensors are not garbage-collected; free them explicitly.
    inputs.dispose();
    labels.dispose();
  }

  outputElement.textContent += 'Training complete!';
  return model;
}
76
 
77
  document.getElementById('trainModel').addEventListener('click', async () => {
78
- const files = document.getElementById('pdfFiles').files;
79
- if (files.length === 0) {
80
- alert('Please select PDF files to train the model.');
81
  return;
82
  }
83
 
84
- outputElement.textContent = 'Training model...\n';
85
- const model = await trainModelFromPDFs(files);
86
 
87
- // Save the model for later use
88
- await model.save('localstorage://pdf-trained-model');
89
- outputElement.textContent += '\nModel saved locally!';
 
 
 
 
 
 
 
 
 
90
  });
91
  </script>
92
  </body>
93
  </html>
 
 
8
  <script src="https://cdn.jsdelivr.net/npm/pdfjs-dist/build/pdf.min.js"></script>
9
  </head>
10
  <body>
11
+ <h1>Train Model from PDFs</h1>
12
+ <input type="file" id="fileInput" multiple>
 
 
 
13
  <button id="trainModel">Train Model</button>
14
+ <pre id="status"></pre>
 
15
 
16
  <script>
 
 
17
  async function extractTextFromPDF(file) {
18
  const pdf = await pdfjsLib.getDocument(await file.arrayBuffer()).promise;
19
+ let text = '';
20
  for (let i = 1; i <= pdf.numPages; i++) {
21
  const page = await pdf.getPage(i);
22
+ const content = await page.getTextContent();
23
+ text += content.items.map(item => item.str).join(' ') + ' ';
24
  }
25
+ return text;
26
  }
27
 
28
/**
 * Builds a small dense binary classifier, trains it on the given examples and
 * saves it to IndexedDB under "pdf-trained-model". Progress is written to the
 * element with id "status".
 *
 * @param {Array<{input: number[], label: number}>} data - Training examples;
 *   every `input` must have the same length.
 * @returns {Promise<object|undefined>} The trained model, or undefined when
 *   `data` is empty.
 */
async function trainModel(data) {
  const status = document.getElementById('status');

  if (!Array.isArray(data) || data.length === 0) {
    status.textContent = 'No training data available.';
    return undefined;
  }

  // Each example is an {input, label} object, so the feature count comes from
  // the input vector, not from the example object itself.
  const featureCount = data[0].input.length;

  const model = tf.sequential();
  model.add(tf.layers.dense({
    units: 128,
    activation: 'relu',
    inputShape: [featureCount]
  }));
  model.add(tf.layers.dense({ units: 64, activation: 'relu' }));
  model.add(tf.layers.dense({ units: 1, activation: 'sigmoid' }));

  model.compile({
    optimizer: 'adam',
    loss: 'binaryCrossentropy',
    metrics: ['accuracy']
  });

  const inputs = tf.tensor2d(data.map(d => d.input));
  const labels = tf.tensor1d(data.map(d => d.label));

  status.textContent = 'Training the model...';

  try {
    await model.fit(inputs, labels, {
      epochs: 10,
      callbacks: {
        onEpochEnd: (epoch, logs) => {
          // Same 1-based epoch number in the console and on the page.
          const message = `Epoch ${epoch + 1}: Loss = ${logs.loss}`;
          console.log(message);
          status.textContent = message;
        }
      }
    });
  } finally {
    // Tensors are not garbage-collected; free them explicitly.
    inputs.dispose();
    labels.dispose();
  }

  status.textContent = 'Saving the model to IndexedDB...';

  try {
    await model.save('indexeddb://pdf-trained-model');
    status.textContent = 'Model saved successfully in IndexedDB!';
  } catch (err) {
    status.textContent = 'Error saving the model: ' + err.message;
    console.error('Error saving the model:', err);
  }

  return model;
}
71
 
72
  document.getElementById('trainModel').addEventListener('click', async () => {
73
+ const files = document.getElementById('fileInput').files;
74
+ if (!files.length) {
75
+ document.getElementById('status').textContent = 'Please select PDF files to train the model.';
76
  return;
77
  }
78
 
79
+ const data = [];
80
+ document.getElementById('status').textContent = 'Extracting text from PDFs...';
81
 
82
+ for (const file of files) {
83
+ const text = await extractTextFromPDF(file);
84
+ const tokens = text.split(/\s+/).map(word => word.length); // Example: using word lengths as features
85
+
86
+ data.push({
87
+ input: tokens.slice(0, 10), // Use the first 10 tokens as input features
88
+ label: 1 // Example label (adjust as needed for your use case)
89
+ });
90
+ }
91
+
92
+ document.getElementById('status').textContent = 'Training the model...';
93
+ await trainModel(data);
94
  });
95
  </script>
96
  </body>
97
  </html>
98
+