fix: Implement distributed locking in Redis for cron jobs
Some checks failed
Deploy to Staging / Build Images (push) Failing after 30s
Deploy to Staging / Deploy to Staging (push) Has been skipped
Deploy to Staging / Verify Staging (push) Has been skipped
Deploy to Staging / Notify Staging Ready (push) Has been skipped
Deploy to Staging / Notify Staging Failure (push) Successful in 6s

This commit is contained in:
Eric Gullickson
2026-01-01 11:02:54 -06:00
parent ffd8ecd1d0
commit d8ea0c7297
6 changed files with 271 additions and 6 deletions

View File

@@ -111,7 +111,23 @@ Backups are stored in `/app/data/backups/` (mapped to `./data/backups/` on host)
Jobs are registered in `backend/src/core/scheduler/index.ts`:
- Backup check: Every minute
- Retention cleanup: Daily at 4 AM
- Retention cleanup: Daily at 4 AM (also runs after each scheduled backup)
### Distributed Locking
Scheduled backups use Redis distributed locking to prevent duplicate backups when multiple backend containers are running (blue-green deployments).
**Lock behavior:**
- Lock key: `backup:schedule:{schedule_id}`
- Lock TTL: 5 minutes (auto-release if container crashes)
- Only one container creates the backup; others skip
**Retention cleanup:**
- Runs immediately after each successful scheduled backup
- Deletes backups exceeding the schedule's retention count
- Also runs globally at 4 AM daily as a safety net
See `backend/src/core/scheduler/README.md` for the distributed locking pattern.
### Admin Routes

View File

@@ -1,12 +1,16 @@
/**
* @ai-summary Job for processing scheduled backups
* @ai-context Runs every minute to check for due scheduled backups
* @ai-context Uses distributed locking to prevent duplicate backups in blue-green deployments
*/
import { Pool } from 'pg';
import { v4 as uuidv4 } from 'uuid';
import { logger } from '../../../core/logging/logger';
import { lockService } from '../../../core/config/redis';
import { BackupRepository } from '../data/backup.repository';
import { BackupService } from '../domain/backup.service';
import { BackupRetentionService } from '../domain/backup-retention.service';
import { ScheduledBackupJobResult, BackupFrequency } from '../domain/backup.types';
let pool: Pool | null = null;
@@ -18,8 +22,12 @@ export function setBackupJobPool(dbPool: Pool): void {
pool = dbPool;
}
// Lock TTL: 5 minutes (backup should complete well within this)
const BACKUP_LOCK_TTL_SECONDS = 300;
/**
* Processes all scheduled backups that are due
* Uses distributed locking to prevent duplicate backups across containers
*/
export async function processScheduledBackups(): Promise<ScheduledBackupJobResult> {
if (!pool) {
@@ -28,6 +36,7 @@ export async function processScheduledBackups(): Promise<ScheduledBackupJobResul
const repository = new BackupRepository(pool);
const backupService = new BackupService(pool);
const retentionService = new BackupRetentionService(pool);
const result: ScheduledBackupJobResult = {
processed: 0,
@@ -48,6 +57,21 @@ export async function processScheduledBackups(): Promise<ScheduledBackupJobResul
logger.info('Processing scheduled backups', { count: dueSchedules.length });
for (const schedule of dueSchedules) {
// Generate unique lock value for this execution
const lockKey = `backup:schedule:${schedule.id}`;
const lockValue = uuidv4();
// Try to acquire lock for this schedule
const lockAcquired = await lockService.acquireLock(lockKey, BACKUP_LOCK_TTL_SECONDS, lockValue);
if (!lockAcquired) {
logger.debug('Backup already in progress for schedule, skipping', {
scheduleId: schedule.id,
scheduleName: schedule.name,
});
continue;
}
result.processed++;
try {
@@ -71,6 +95,28 @@ export async function processScheduledBackups(): Promise<ScheduledBackupJobResul
scheduleId: schedule.id,
backupId: backupResult.backupId,
});
// Run retention cleanup for this schedule immediately after successful backup
try {
const retentionResult = await retentionService.cleanupScheduleBackups(
schedule.id,
schedule.name,
schedule.retentionCount
);
if (retentionResult.deletedCount > 0) {
logger.info('Retention cleanup completed after backup', {
scheduleId: schedule.id,
deletedCount: retentionResult.deletedCount,
freedBytes: retentionResult.freedBytes,
});
}
} catch (retentionError) {
logger.error('Retention cleanup failed after backup', {
scheduleId: schedule.id,
error: retentionError instanceof Error ? retentionError.message : String(retentionError),
});
// Don't fail the overall backup for retention errors
}
} else {
result.failed++;
result.errors.push({
@@ -103,6 +149,9 @@ export async function processScheduledBackups(): Promise<ScheduledBackupJobResul
} catch {
// Ignore error updating next run
}
} finally {
// Always release the lock
await lockService.releaseLock(lockKey, lockValue);
}
}