Prathamesh Sarjerao Vaidya committed on
Commit
65fbbac
·
1 Parent(s): 65f46e8

made changes

Browse files
.github/workflows/check.yml CHANGED
@@ -20,28 +20,77 @@ jobs:
20
  runs-on: ubuntu-latest
21
  needs: check-file-size
22
  if: github.event_name == 'pull_request'
 
 
23
  steps:
24
  - uses: actions/checkout@v3
25
  with:
26
  lfs: true
 
27
 
28
  - name: Pull LFS files
29
  run: |
30
  git lfs install
31
  git lfs pull
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  - name: Setup Python
34
  uses: actions/setup-python@v4
35
  with:
36
  python-version: '3.11'
37
 
 
38
  - name: Setup system dependencies
39
- run: chmod +x .github/workflows/scripts/setup_system.sh && .github/workflows/scripts/setup_system.sh
 
 
 
40
 
 
41
  - name: Convert MD to PDF
42
- run: chmod +x .github/workflows/scripts/convert_md_to_pdf.sh && .github/workflows/scripts/convert_md_to_pdf.sh
 
 
 
43
 
 
44
  - name: Upload PDF artifacts
 
45
  uses: actions/upload-artifact@v4
46
  with:
47
  name: converted-pdfs
@@ -51,4 +100,10 @@ jobs:
51
  - name: Upload to Google Drive
52
  env:
53
  GOOGLE_OAUTH_TOKEN: ${{ secrets.GOOGLE_OAUTH_TOKEN }}
54
- run: python .github/workflows/scripts/upload_to_drive.py
 
 
 
 
 
 
 
20
  runs-on: ubuntu-latest
21
  needs: check-file-size
22
  if: github.event_name == 'pull_request'
23
+ outputs:
24
+ skip_pdf: ${{ steps.check_md_changes.outputs.skip_pdf }}
25
  steps:
26
  - uses: actions/checkout@v3
27
  with:
28
  lfs: true
29
+ fetch-depth: 0 # Need full history for git diff
30
 
31
  - name: Pull LFS files
32
  run: |
33
  git lfs install
34
  git lfs pull
35
 
36
+ # NEW STEP: Check if MD files were modified
37
+ - name: Check for MD file changes
38
+ id: check_md_changes
39
+ run: |
40
+ echo "Checking for markdown file changes..."
41
+
42
+ # Get the commit message
43
+ COMMIT_MSG="${{ github.event.head_commit.message }}"
44
+ if [ -z "$COMMIT_MSG" ]; then
45
+ COMMIT_MSG=$(git log -1 --pretty=%B)
46
+ fi
47
+ echo "Commit message: $COMMIT_MSG"
48
+
49
+ # Check if commit message indicates MD changes
50
+ MD_IN_COMMIT=$(echo "$COMMIT_MSG" | grep -i "\.md\|markdown\|documentation\|docs\|readme" || true)
51
+
52
+ # Check if any MD files were actually modified in the diff
53
+ if [ "${{ github.event_name }}" = "pull_request" ]; then
54
+ MD_FILES_CHANGED=$(git diff --name-only origin/main...HEAD | grep "\.md$" || true)
55
+ else
56
+ MD_FILES_CHANGED=$(git diff --name-only HEAD~1 HEAD | grep "\.md$" || true)
57
+ fi
58
+
59
+ echo "MD files in commit message: $MD_IN_COMMIT"
60
+ echo "MD files changed: $MD_FILES_CHANGED"
61
+
62
+ # Skip PDF conversion if no MD files changed AND no MD-related keywords in commit
63
+ if [ -z "$MD_FILES_CHANGED" ] && [ -z "$MD_IN_COMMIT" ]; then
64
+ echo "skip_pdf=true" >> $GITHUB_OUTPUT
65
+ echo "Skipping PDF conversion - no MD files modified"
66
+ else
67
+ echo "skip_pdf=false" >> $GITHUB_OUTPUT
68
+ echo "MD files detected - will convert to PDF"
69
+ echo "Changed MD files: $MD_FILES_CHANGED"
70
+ fi
71
+
72
  - name: Setup Python
73
  uses: actions/setup-python@v4
74
  with:
75
  python-version: '3.11'
76
 
77
+ # CONDITIONAL STEP: Only run if MD files changed
78
  - name: Setup system dependencies
79
+ if: steps.check_md_changes.outputs.skip_pdf == 'false'
80
+ run: |
81
+ echo "Setting up system dependencies for PDF conversion..."
82
+ chmod +x .github/workflows/scripts/setup_system.sh && .github/workflows/scripts/setup_system.sh
83
 
84
+ # CONDITIONAL STEP: Only run if MD files changed
85
  - name: Convert MD to PDF
86
+ if: steps.check_md_changes.outputs.skip_pdf == 'false'
87
+ run: |
88
+ echo "Converting MD files to PDF..."
89
+ chmod +x .github/workflows/scripts/convert_md_to_pdf.sh && .github/workflows/scripts/convert_md_to_pdf.sh
90
 
91
+ # CONDITIONAL STEP: Only run if MD files changed
92
  - name: Upload PDF artifacts
93
+ if: steps.check_md_changes.outputs.skip_pdf == 'false'
94
  uses: actions/upload-artifact@v4
95
  with:
96
  name: converted-pdfs
 
100
  - name: Upload to Google Drive
101
  env:
102
  GOOGLE_OAUTH_TOKEN: ${{ secrets.GOOGLE_OAUTH_TOKEN }}
103
+ run: |
104
+ if [ "${{ steps.check_md_changes.outputs.skip_pdf }}" = "true" ]; then
105
+ echo "Skipped PDF conversion - uploading existing files only"
106
+ else
107
+ echo "Uploading files including new PDFs to Google Drive"
108
+ fi
109
+ python .github/workflows/scripts/upload_to_drive.py
.github/workflows/main.yml CHANGED
@@ -18,18 +18,57 @@ jobs:
18
  git lfs install
19
  git lfs pull
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  - name: Setup Python
22
  uses: actions/setup-python@v4
23
  with:
24
  python-version: '3.11'
25
 
 
26
  - name: Setup system dependencies
27
- run: chmod +x .github/workflows/scripts/setup_system.sh && .github/workflows/scripts/setup_system.sh
 
 
 
28
 
 
29
  - name: Convert MD to PDF
30
- run: chmod +x .github/workflows/scripts/convert_md_to_pdf.sh && .github/workflows/scripts/convert_md_to_pdf.sh
 
 
 
31
 
 
32
  - name: Upload PDF artifacts
 
33
  uses: actions/upload-artifact@v4
34
  with:
35
  name: converted-pdfs
@@ -42,7 +81,13 @@ jobs:
42
  GOOGLE_OAUTH_TOKEN: ${{ secrets.GOOGLE_OAUTH_TOKEN }}
43
  # Fallback authentication method (Service Account)
44
  GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }}
45
- run: python .github/workflows/scripts/upload_to_drive.py
 
 
 
 
 
 
46
 
47
  - name: Push to Hugging Face hub
48
  env:
 
18
  git lfs install
19
  git lfs pull
20
 
21
+ # NEW STEP: Check for MD file changes
22
+ - name: Check for MD file changes
23
+ id: check_md_changes
24
+ run: |
25
+ echo "Checking for markdown file changes..."
26
+
27
+ # Get the commit message
28
+ COMMIT_MSG=$(git log -1 --pretty=%B)
29
+ echo "Commit message: $COMMIT_MSG"
30
+
31
+ # Check if commit message indicates MD changes
32
+ MD_IN_COMMIT=$(echo "$COMMIT_MSG" | grep -i "\.md\|markdown\|documentation\|docs\|readme" || true)
33
+
34
+ # Check if any MD files were actually modified in the last commit
35
+ MD_FILES_CHANGED=$(git diff --name-only HEAD~1 HEAD | grep "\.md$" || true)
36
+
37
+ echo "MD files in commit message: $MD_IN_COMMIT"
38
+ echo "MD files changed: $MD_FILES_CHANGED"
39
+
40
+ # Skip PDF conversion if no MD files changed AND no MD-related keywords in commit
41
+ if [ -z "$MD_FILES_CHANGED" ] && [ -z "$MD_IN_COMMIT" ]; then
42
+ echo "skip_pdf=true" >> $GITHUB_OUTPUT
43
+ echo "Skipping PDF conversion - no MD files modified"
44
+ else
45
+ echo "skip_pdf=false" >> $GITHUB_OUTPUT
46
+ echo "MD files detected - will convert to PDF"
47
+ echo "Changed MD files: $MD_FILES_CHANGED"
48
+ fi
49
+
50
  - name: Setup Python
51
  uses: actions/setup-python@v4
52
  with:
53
  python-version: '3.11'
54
 
55
+ # CONDITIONAL STEP: Only run if MD files changed
56
  - name: Setup system dependencies
57
+ if: steps.check_md_changes.outputs.skip_pdf == 'false'
58
+ run: |
59
+ echo "Setting up system dependencies for PDF conversion..."
60
+ chmod +x .github/workflows/scripts/setup_system.sh && .github/workflows/scripts/setup_system.sh
61
 
62
+ # CONDITIONAL STEP: Only run if MD files changed
63
  - name: Convert MD to PDF
64
+ if: steps.check_md_changes.outputs.skip_pdf == 'false'
65
+ run: |
66
+ echo "Converting MD files to PDF..."
67
+ chmod +x .github/workflows/scripts/convert_md_to_pdf.sh && .github/workflows/scripts/convert_md_to_pdf.sh
68
 
69
+ # CONDITIONAL STEP: Only run if MD files changed
70
  - name: Upload PDF artifacts
71
+ if: steps.check_md_changes.outputs.skip_pdf == 'false'
72
  uses: actions/upload-artifact@v4
73
  with:
74
  name: converted-pdfs
 
81
  GOOGLE_OAUTH_TOKEN: ${{ secrets.GOOGLE_OAUTH_TOKEN }}
82
  # Fallback authentication method (Service Account)
83
  GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }}
84
+ run: |
85
+ if [ "${{ steps.check_md_changes.outputs.skip_pdf }}" = "true" ]; then
86
+ echo "Skipped PDF conversion - uploading existing files only"
87
+ else
88
+ echo "Uploading files including new PDFs to Google Drive"
89
+ fi
90
+ python .github/workflows/scripts/upload_to_drive.py
91
 
92
  - name: Push to Hugging Face hub
93
  env:
Dockerfile CHANGED
@@ -26,6 +26,7 @@ RUN apt-get update && apt-get install -y \
26
  libavformat-dev \
27
  libavutil-dev \
28
  libswresample-dev \
 
29
  && rm -rf /var/lib/apt/lists/*
30
 
31
  # Copy requirements first for better caching
@@ -33,6 +34,11 @@ COPY requirements.txt .
33
 
34
  # Install Python dependencies with proper error handling
35
  RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
 
 
 
 
 
36
  pip install --no-cache-dir -r requirements.txt
37
 
38
  # Copy application code
 
26
  libavformat-dev \
27
  libavutil-dev \
28
  libswresample-dev \
29
+ execstack \
30
  && rm -rf /var/lib/apt/lists/*
31
 
32
  # Copy requirements first for better caching
 
34
 
35
  # Install Python dependencies with proper error handling
36
  RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
37
+ # Install ONNX Runtime CPU version specifically
38
+ pip install --no-cache-dir onnxruntime==1.16.3 && \
39
+ # Fix executable stack issue
40
+ find /usr/local/lib/python*/site-packages/onnxruntime -name "*.so" -exec execstack -c {} \; 2>/dev/null || true && \
41
+ # Install other requirements
42
  pip install --no-cache-dir -r requirements.txt
43
 
44
  # Copy application code
model_preloader.py CHANGED
@@ -397,55 +397,85 @@ class ModelPreloader:
397
  except Exception as e:
398
  logger.warning(f"Error saving cache for {model_key}: {e}")
399
 
400
- def load_pyannote_pipeline(self, task_id: str) -> Optional[Pipeline]:
401
  """Load pyannote speaker diarization pipeline with container-safe settings."""
402
  try:
403
  console.print(f"[yellow]Loading pyannote.audio pipeline...[/yellow]")
404
 
 
 
 
 
 
 
 
 
 
 
405
  # Check for HuggingFace token
406
- hf_token = os.getenv('HUGGINGFACE_TOKEN')
407
  if not hf_token:
408
  console.print("[red]Warning: HUGGINGFACE_TOKEN not found. Some models may not be accessible.[/red]")
409
 
410
- # Container-safe pipeline loading with error suppression
411
- with warnings.catch_warnings():
412
- warnings.filterwarnings("ignore", category=UserWarning)
413
- warnings.filterwarnings("ignore", message=".*executable stack.*")
414
-
 
 
 
 
 
 
 
 
 
 
 
415
  pipeline = Pipeline.from_pretrained(
416
  "pyannote/speaker-diarization-3.1",
417
  use_auth_token=hf_token,
418
  cache_dir=str(self.cache_dir / "pyannote")
419
  )
420
 
421
- # Force CPU execution
422
  if hasattr(pipeline, '_models'):
423
  for model_name, model in pipeline._models.items():
424
  if hasattr(model, 'to'):
425
  model.to('cpu')
426
-
427
- console.print(f"[green]SUCCESS: pyannote.audio pipeline loaded successfully on {self.device}[/green]")
428
- return pipeline
 
 
 
 
429
 
430
  except Exception as e:
431
- # Check if it's the expected ONNX Runtime warning
432
- if "executable stack" in str(e).lower():
433
- console.print("[yellow]ONNX Runtime executable stack warning (expected in containers) - continuing...[/yellow]")
434
- # Try alternative loading method
 
435
  try:
436
- import warnings
437
- with warnings.catch_warnings():
438
- warnings.simplefilter("ignore")
439
- pipeline = Pipeline.from_pretrained(
440
- "pyannote/speaker-diarization-3.1",
441
- use_auth_token=hf_token,
442
- cache_dir=str(self.cache_dir / "pyannote")
443
- )
444
- return pipeline
445
- except:
446
- pass
 
 
 
 
 
447
 
448
- console.print(f"[red]ERROR: Failed to load pyannote.audio pipeline: {e}[/red]")
449
  logger.error(f"Pyannote loading failed: {e}")
450
  return None
451
 
 
397
  except Exception as e:
398
  logger.warning(f"Error saving cache for {model_key}: {e}")
399
 
400
+ def load_pyannote_pipeline(self) -> Optional[Pipeline]:
401
  """Load pyannote speaker diarization pipeline with container-safe settings."""
402
  try:
403
  console.print(f"[yellow]Loading pyannote.audio pipeline...[/yellow]")
404
 
405
+ # Fix ONNX Runtime libraries first
406
+ try:
407
+ import subprocess
408
+ subprocess.run([
409
+ 'find', '/usr/local/lib/python*/site-packages/onnxruntime',
410
+ '-name', '*.so', '-exec', 'execstack', '-c', '{}', ';'
411
+ ], capture_output=True, timeout=10, stderr=subprocess.DEVNULL)
412
+ except:
413
+ pass
414
+
415
  # Check for HuggingFace token
416
+ hf_token = os.getenv('HUGGINGFACE_TOKEN') or os.getenv('HF_TOKEN')
417
  if not hf_token:
418
  console.print("[red]Warning: HUGGINGFACE_TOKEN not found. Some models may not be accessible.[/red]")
419
 
420
+ # Suppress all warnings during pipeline loading
421
+ import warnings
422
+ import logging
423
+
424
+ # Temporarily disable all warnings and logging
425
+ old_warning_filters = warnings.filters[:]
426
+ warnings.filterwarnings("ignore")
427
+
428
+ # Disable ONNX Runtime logging
429
+ os.environ['ORT_LOGGING_LEVEL'] = '3' # ERROR only
430
+
431
+ # Disable other verbose logging
432
+ logging.getLogger('onnxruntime').setLevel(logging.ERROR)
433
+ logging.getLogger('transformers').setLevel(logging.ERROR)
434
+
435
+ try:
436
  pipeline = Pipeline.from_pretrained(
437
  "pyannote/speaker-diarization-3.1",
438
  use_auth_token=hf_token,
439
  cache_dir=str(self.cache_dir / "pyannote")
440
  )
441
 
442
+ # Force CPU execution for all models in pipeline
443
  if hasattr(pipeline, '_models'):
444
  for model_name, model in pipeline._models.items():
445
  if hasattr(model, 'to'):
446
  model.to('cpu')
447
+
448
+ console.print(f"[green]SUCCESS: pyannote.audio pipeline loaded successfully on CPU[/green]")
449
+ return pipeline
450
+
451
+ finally:
452
+ # Restore warning filters
453
+ warnings.filters[:] = old_warning_filters
454
 
455
  except Exception as e:
456
+ error_msg = str(e).lower()
457
+ if "executable stack" in error_msg or "onnxruntime" in error_msg:
458
+ console.print("[yellow]ONNX Runtime container warning (attempting workaround)...[/yellow]")
459
+
460
+ # Try alternative approach - load without ONNX-dependent components
461
  try:
462
+ # Try loading with CPU-only execution providers
463
+ import onnxruntime as ort
464
+ ort.set_default_logger_severity(4) # FATAL only
465
+
466
+ pipeline = Pipeline.from_pretrained(
467
+ "pyannote/speaker-diarization-3.1",
468
+ use_auth_token=hf_token,
469
+ cache_dir=str(self.cache_dir / "pyannote")
470
+ )
471
+ console.print(f"[green]SUCCESS: pyannote.audio loaded with workaround[/green]")
472
+ return pipeline
473
+
474
+ except Exception as e2:
475
+ console.print(f"[red]ERROR: All pyannote loading methods failed: {e2}[/red]")
476
+ else:
477
+ console.print(f"[red]ERROR: Failed to load pyannote.audio pipeline: {e}[/red]")
478
 
 
479
  logger.error(f"Pyannote loading failed: {e}")
480
  return None
481
 
startup.py CHANGED
@@ -70,15 +70,33 @@ def preload_models():
70
  import model_preloader
71
  logger.info('✅ Model preloader module found')
72
 
73
- # Set environment variables to handle onnxruntime issues
74
  env = os.environ.copy()
75
  env.update({
76
  'ORT_DYLIB_DEFAULT_OPTIONS': 'DisableExecutablePageAllocator=1',
77
  'ONNXRUNTIME_EXECUTION_PROVIDERS': 'CPUExecutionProvider',
 
78
  'TF_ENABLE_ONEDNN_OPTS': '0',
79
- 'OMP_NUM_THREADS': '1'
 
 
 
 
 
 
 
80
  })
81
 
 
 
 
 
 
 
 
 
 
 
82
  # Try to run the preloader
83
  result = subprocess.run(
84
  ['python', 'model_preloader.py'],
@@ -96,18 +114,22 @@ def preload_models():
96
  else:
97
  logger.warning(f'⚠️ Model preloading failed with return code {result.returncode}')
98
  if result.stderr:
99
- # Check if it's the onnxruntime issue
100
- if 'cannot enable executable stack' in result.stderr:
101
- logger.warning('⚠️ ONNX Runtime executable stack issue detected - this is expected in containers')
102
- else:
103
- logger.warning(f'Preloader stderr: {result.stderr[:500]}...')
 
 
 
104
  return False
105
 
106
  except subprocess.TimeoutExpired:
107
  logger.warning('⚠️ Model preloading timed out, continuing...')
108
  return False
109
  except Exception as e:
110
- logger.warning(f'⚠️ Model preloading failed: {e}')
 
111
  return False
112
 
113
  def start_web_app():
 
70
  import model_preloader
71
  logger.info('✅ Model preloader module found')
72
 
73
+ # Set comprehensive environment variables for ONNX Runtime
74
  env = os.environ.copy()
75
  env.update({
76
  'ORT_DYLIB_DEFAULT_OPTIONS': 'DisableExecutablePageAllocator=1',
77
  'ONNXRUNTIME_EXECUTION_PROVIDERS': 'CPUExecutionProvider',
78
+ 'ORT_DISABLE_TLS_ARENA': '1',
79
  'TF_ENABLE_ONEDNN_OPTS': '0',
80
+ 'OMP_NUM_THREADS': '1',
81
+ 'MKL_NUM_THREADS': '1',
82
+ 'NUMBA_NUM_THREADS': '1',
83
+ 'TOKENIZERS_PARALLELISM': 'false',
84
+ 'MALLOC_ARENA_MAX': '2',
85
+ # Additional ONNX Runtime fixes
86
+ 'ONNXRUNTIME_LOG_SEVERITY_LEVEL': '3',
87
+ 'ORT_LOGGING_LEVEL': 'WARNING'
88
  })
89
 
90
+ # Try to fix ONNX Runtime libraries before running preloader
91
+ try:
92
+ import subprocess
93
+ subprocess.run([
94
+ 'find', '/usr/local/lib/python*/site-packages/onnxruntime',
95
+ '-name', '*.so', '-exec', 'execstack', '-c', '{}', ';'
96
+ ], capture_output=True, timeout=30)
97
+ except:
98
+ pass # Continue if execstack fix fails
99
+
100
  # Try to run the preloader
101
  result = subprocess.run(
102
  ['python', 'model_preloader.py'],
 
114
  else:
115
  logger.warning(f'⚠️ Model preloading failed with return code {result.returncode}')
116
  if result.stderr:
117
+ # Filter out expected ONNX warnings
118
+ stderr_lines = result.stderr.split('\n')
119
+ important_errors = [line for line in stderr_lines
120
+ if 'executable stack' not in line.lower()
121
+ and 'onnxruntime' not in line.lower()
122
+ and line.strip()]
123
+ if important_errors:
124
+ logger.warning(f'Important errors: {important_errors[:3]}')
125
  return False
126
 
127
  except subprocess.TimeoutExpired:
128
  logger.warning('⚠️ Model preloading timed out, continuing...')
129
  return False
130
  except Exception as e:
131
+ if 'executable stack' not in str(e).lower():
132
+ logger.warning(f'⚠️ Model preloading failed: {e}')
133
  return False
134
 
135
  def start_web_app():