Fix diarization tensor mismatch + fix sidecar build triggers

Diarization: The Audio.crop patch now pads short segments with zeros to match the expected duration. pyannote batches embeddings with vstack, which requires uniform tensor sizes, and the last segment of a file can be shorter than the 10s window.

CI: Reordered the sidecar workflow to check for python/ changes FIRST, before bumping the version or configuring git. All subsequent steps are gated on has_changes. This prevents unnecessary version bumps and build runs when only app code changes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 18:30:40 -07:00
parent 6f9dc9a95e
commit 879a1f3fd6
2 changed files with 35 additions and 19 deletions
@@ -18,14 +18,34 @@ jobs:
    steps:
      - uses: actions/checkout@v4
        with:
-          fetch-depth: 0
+          fetch-depth: 2
      # Decides whether the sidecar needs to be rebuilt. Writes the step output
      # `has_changes` ("true"/"false"), which later steps read through
      # steps.check_changes.outputs.has_changes.
      - name: Check for python changes
        id: check_changes
        run: |
          # If triggered by workflow_dispatch, always build
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # Check if any python/ files changed in this commit.
          # NOTE: only the tip commit (HEAD~1..HEAD) is inspected, so python/
          # changes in earlier commits of a multi-commit push are not seen.
          # If the diff itself fails (e.g. HEAD~1 is missing on a root commit
          # or a depth-1 checkout), say so explicitly rather than hiding the
          # error; the outcome is unchanged: treated as "no changes".
          if ! CHANGED=$(git diff --name-only HEAD~1 HEAD -- python/ 2>/dev/null); then
            echo "Could not diff HEAD~1..HEAD (initial commit or shallow clone?); treating as no python/ changes"
            CHANGED=""
          fi
          if [ -n "$CHANGED" ]; then
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
            echo "Python changes detected: $CHANGED"
          else
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
            echo "No python/ changes detected, skipping sidecar build"
          fi
      # Sets the git author name and email for this job's checkout. Runs only
      # when the check_changes step reported has_changes == 'true'.
      # Presumably required by the later "Commit and tag" step -- confirm.
      - name: Configure git
        if: steps.check_changes.outputs.has_changes == 'true'
        run: |
          git config user.name "Gitea Actions"
          git config user.email "actions@gitea.local"
      - name: Bump sidecar patch version
        if: steps.check_changes.outputs.has_changes == 'true'
        id: bump
        run: |
          # Read current version from python/pyproject.toml
@@ -46,23 +66,6 @@ jobs:
          echo "version=${NEW_VERSION}" >> $GITHUB_OUTPUT
          echo "tag=sidecar-v${NEW_VERSION}" >> $GITHUB_OUTPUT
      - name: Check for python changes
        id: check_changes
        run: |
          # If triggered by workflow_dispatch, always build
          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            echo "has_changes=true" >> $GITHUB_OUTPUT
            exit 0
          fi
          # Check if any python/ files changed in this commit
          CHANGED=$(git diff --name-only HEAD~1 HEAD -- python/ || echo "")
          if [ -n "$CHANGED" ]; then
            echo "has_changes=true" >> $GITHUB_OUTPUT
          else
            echo "has_changes=false" >> $GITHUB_OUTPUT
            echo "No python/ changes detected, skipping sidecar build"
          fi
      - name: Commit and tag
        if: steps.check_changes.outputs.has_changes == 'true'
        env:
@@ -56,7 +56,12 @@ def _patch_pyannote_audio() -> None:
            return _sf_load(file["audio"])
        def _soundfile_crop(self, file: dict, segment, **kwargs) -> tuple:
-            """Replacement for Audio.crop — load full file then slice."""
+            """Replacement for Audio.crop — load full file then slice.
            Pads short segments with zeros to match the expected duration,
            which pyannote requires for batched embedding extraction.
            """
            duration = kwargs.get("duration", None)
            waveform, sample_rate = _sf_load(file["audio"])
            # Convert segment (seconds) to sample indices
            start_sample = int(segment.start * sample_rate)
@@ -65,6 +70,14 @@ def _patch_pyannote_audio() -> None:
            start_sample = max(0, start_sample)
            end_sample = min(waveform.shape[-1], end_sample)
            cropped = waveform[:, start_sample:end_sample]
            # Pad to expected duration if needed (pyannote batches require uniform size)
            if duration is not None:
                expected_samples = int(duration * sample_rate)
            else:
                expected_samples = int((segment.end - segment.start) * sample_rate)
            if cropped.shape[-1] < expected_samples:
                pad = torch.zeros(cropped.shape[0], expected_samples - cropped.shape[-1])
                cropped = torch.cat([cropped, pad], dim=-1)
            return cropped, sample_rate
        Audio.__call__ = _soundfile_call  # type: ignore[assignment]