socket-lib/src/archives.ts at main · SocketDev/socket-lib · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
/**
 * @fileoverview Generic archive extraction utilities.
 * Supports zip, tar, tar.gz, and tgz formats.
 */

import { createReadStream, existsSync } from 'node:fs'
import process from 'node:process'
import { pipeline } from 'node:stream/promises'
import { createGunzip } from 'node:zlib'

import { safeMkdir } from './fs'
import { normalizePath } from './paths/normalize'

import type AdmZipType from './external/adm-zip'
import type tarFsType from './external/tar-fs'

import {
  ArrayFrom,
  ArrayPrototypeSlice,
  ErrorCtor,
  PromiseAll,
  SetCtor,
  StringPrototypeEndsWith,
  StringPrototypeStartsWith,
} from './primordials'

/**
 * Identifiers for the archive formats this module can extract. These are
 * the non-null values produced by `detectArchiveFormat`.
 */
export type ArchiveFormat =
  | 'tar'
  | 'tar.gz'
  | 'tgz'
  | 'zip'

/**
 * Tuning knobs accepted by every extractor in this module. All fields are
 * optional; an omitted (or `undefined`) limit falls back to the matching
 * `DEFAULT_MAX_*` constant.
 */
export interface ExtractOptions {
  /**
   * Upper bound on the number of archive entries (default: 100,000).
   */
  maxEntries?: number | undefined
  /**
   * Upper bound, in bytes, on the declared size of any single file
   * (default: 100MB).
   */
  maxFileSize?: number | undefined
  /**
   * Upper bound, in bytes, on the sum of all declared file sizes
   * (default: 1GB).
   */
  maxTotalSize?: number | undefined
  /**
   * Suppress log messages.
   * NOTE(review): none of the extractors visible in this file read this
   * flag — confirm whether it is still honored anywhere.
   */
  quiet?: boolean | undefined
  /**
   * Number of leading path components to drop from each entry
   * (like `tar --strip-components`).
   */
  strip?: number | undefined
}

// Fallback extraction limits, applied when the caller does not override
// them. They exist to blunt zip bombs and other DoS-style archives.

/** Largest single file accepted by default: 100MB. */
const DEFAULT_MAX_FILE_SIZE = 100 * 1024 ** 2
/** Largest cumulative extracted size accepted by default: 1GB. */
const DEFAULT_MAX_TOTAL_SIZE = 1024 ** 3
/** Default cap on entry count, guarding against inode exhaustion. */
const DEFAULT_MAX_ENTRIES = 100_000

// Module-level caches for lazily required dependencies. Each one stays
// `undefined` until its getter (getAdmZip / getTarFs / getPath) runs for
// the first time, after which the loaded module is reused.
let _AdmZip: typeof AdmZipType | undefined
let _tarFs: typeof tarFsType | undefined
let _path: typeof import('node:path') | undefined

/**
 * Return the bundled adm-zip module, requiring it on first use and
 * caching it in `_AdmZip` for later calls.
 *
 * @private
 */
/*@__NO_SIDE_EFFECTS__*/
function getAdmZip() {
  if (_AdmZip !== undefined) {
    return _AdmZip
  }
  _AdmZip = /*@__PURE__*/ require('./external/adm-zip.js')
  return _AdmZip!
}

/**
 * Return `node:path`, requiring it on first use and caching it in `_path`.
 * The module is loaded lazily to avoid Webpack errors.
 *
 * @private
 */
/*@__NO_SIDE_EFFECTS__*/
function getPath() {
  if (_path !== undefined) {
    return _path
  }
  _path = /*@__PURE__*/ require('node:path')
  return _path as typeof import('node:path')
}

/**
 * Return the bundled tar-fs module, requiring it on first use and
 * caching it in `_tarFs` for later calls.
 *
 * @private
 */
/*@__NO_SIDE_EFFECTS__*/
function getTarFs() {
  if (_tarFs !== undefined) {
    return _tarFs
  }
  _tarFs = /*@__PURE__*/ require('./external/tar-fs.js')
  return _tarFs!
}

/**
 * Validate that a resolved path is within the target directory.
 * Prevents path traversal attacks.
 *
 * Both paths are resolved to absolute form and compared via
 * `path.relative`, so a sibling such as `/base/dir-evil` is never mistaken
 * for a child of `/base/dir`, and a base directory that is itself a
 * filesystem root (`/`, `C:\`) is handled correctly.
 *
 * @param targetPath - The resolved path to validate
 * @param baseDir - The base directory that should contain the path
 * @param entryName - Original entry name for error reporting
 * @throws Error if path is outside the base directory
 * @private
 */
function validatePathWithinBase(
  targetPath: string,
  baseDir: string,
  entryName: string,
): void {
  const path = getPath()
  const resolvedTarget = path.resolve(targetPath)
  const resolvedBase = path.resolve(baseDir)

  // path.relative() returns '' when target === base, a plain relative path
  // for descendants, a path beginning with a '..' segment for anything
  // outside the base, and (on Windows) an absolute path when the two sit
  // on different drives. Comparing against `base + sep` instead would
  // reject every entry when the base is a root, because the resolved root
  // already ends with a separator.
  const relative = path.relative(resolvedBase, resolvedTarget)
  const escapesBase =
    relative === '..' ||
    StringPrototypeStartsWith(relative, `..${path.sep}`) ||
    path.isAbsolute(relative)

  if (escapesBase) {
    throw new ErrorCtor(
      `Path traversal attempt detected: entry "${entryName}" would extract to "${resolvedTarget}" outside target directory "${resolvedBase}"`,
    )
  }
}

/**
 * Preflight check run by every extractor before the archive is handed to
 * the underlying library: fail fast if `archivePath` is not on disk.
 *
 * The point is a uniform "missing archive" error for zip, tar and tar.gz.
 * Without it, a missing zip surfaces from adm-zip as `"Invalid filename"`
 * (with no path in the message), while tar/tar.gz surface the raw Node
 * `ENOENT`. With it, all three throw the same Node-style `ENOENT` error
 * carrying the archive path.
 *
 * @param archivePath - Path of the archive that is about to be opened.
 * @throws Error with `code: 'ENOENT'` and `path` set to `archivePath` when
 *   the file does not exist.
 * @private
 */
function assertArchiveExists(archivePath: string): void {
  if (existsSync(archivePath)) {
    return
  }

  // Mimic the shape of the error Node itself raises for a failed open().
  const enoent = new ErrorCtor(
    `ENOENT: no such file or directory, open '${archivePath}'`,
  ) as Error & { code: string; path: string }
  enoent.code = 'ENOENT'
  enoent.path = archivePath
  throw enoent
}

/**
 * Work out which archive format a file is, judging only by its name.
 * The comparison is case-insensitive and looks at the end of the path.
 *
 * @param filePath - Path to archive file
 * @returns The matching format, or `null` when the suffix is not one of
 *   `.tar.gz`, `.tgz`, `.tar` or `.zip`
 *
 * @example
 * ```typescript
 * detectArchiveFormat('package.tar.gz')  // 'tar.gz'
 * detectArchiveFormat('archive.zip')     // 'zip'
 * detectArchiveFormat('data.csv')        // null
 * ```
 */
export function detectArchiveFormat(filePath: string): ArchiveFormat | null {
  const lowered = filePath.toLowerCase()

  // Recognised suffixes paired with the format each one maps to.
  const suffixTable: Array<[string, ArchiveFormat]> = [
    ['.tar.gz', 'tar.gz'],
    ['.tgz', 'tgz'],
    ['.tar', 'tar'],
    ['.zip', 'zip'],
  ]

  for (const [suffix, format] of suffixTable) {
    if (StringPrototypeEndsWith(lowered, suffix)) {
      return format
    }
  }
  return null
}

/**
 * Extract an archive into a directory, choosing the extractor from the
 * archive's file extension (see `detectArchiveFormat`).
 *
 * @param archivePath - Path to archive file
 * @param outputDir - Directory to extract to
 * @param options - Extraction options, forwarded unchanged to the
 *   format-specific extractor
 * @throws Error if archive format is not supported
 *
 * @example
 * ```typescript
 * await extractArchive('/tmp/package.tar.gz', '/tmp/output')
 * await extractArchive('/tmp/release.zip', '/tmp/output', { strip: 1 })
 * ```
 */
export async function extractArchive(
  archivePath: string,
  outputDir: string,
  options: ExtractOptions = {},
): Promise<void> {
  const format = detectArchiveFormat(archivePath)

  if (format === 'zip') {
    return await extractZip(archivePath, outputDir, options)
  }
  if (format === 'tar') {
    return await extractTar(archivePath, outputDir, options)
  }
  if (format === 'tar.gz' || format === 'tgz') {
    return await extractTarGz(archivePath, outputDir, options)
  }

  // Unknown format: report the extension (when there is one) alongside the
  // full path so the caller can see what was rejected.
  const ext = getPath().extname(archivePath).toLowerCase()
  const extNote = ext ? ` (extension: ${ext})` : ''
  throw new ErrorCtor(
    `Unsupported archive format${extNote}: ${archivePath}. ` +
      'Supported formats: .zip, .tar, .tar.gz, .tgz',
  )
}

/**
 * Build a tar-fs extract stream for `outputDir` whose `map` hook enforces
 * the extraction limits in `options`.
 *
 * For every entry header the hook checks, in order: the entry count, null
 * bytes in the name, link entries (`symlink` / `link`), the per-file size,
 * and the running total size. On the first violation it schedules
 * `extractStream.destroy(error)` for the next tick and lets every later
 * header through untouched, so only the first violation is reported.
 *
 * A no-op `'error'` listener is attached before the stream is returned;
 * the caller is expected to observe the failure through `pipeline`.
 *
 * @param outputDir - Already-normalized directory to extract into
 * @param options - Extraction options (limits and `strip`)
 * @returns The configured tar-fs extract stream
 * @private
 */
function createGuardedTarExtract(outputDir: string, options: ExtractOptions) {
  const {
    maxEntries = DEFAULT_MAX_ENTRIES,
    maxFileSize = DEFAULT_MAX_FILE_SIZE,
    maxTotalSize = DEFAULT_MAX_TOTAL_SIZE,
    strip = 0,
  } = options

  let totalExtractedSize = 0
  let entryCount = 0
  let destroyScheduled = false

  // Flag the violation now, destroy the stream on the next tick. The
  // message is produced by a thunk so it is rendered at destroy time.
  const scheduleDestroy = (describe: () => string): void => {
    destroyScheduled = true
    process.nextTick(() => {
      extractStream.destroy(new Error(describe()))
    })
  }

  const tarFs = getTarFs()
  const extractStream = tarFs.extract(outputDir, {
    map: (header: { name: string; size?: number; type?: string }) => {
      // Skip if destroy already scheduled
      if (destroyScheduled) {
        return header
      }

      // Check entry count to prevent inode exhaustion DoS.
      entryCount += 1
      if (entryCount > maxEntries) {
        scheduleDestroy(
          () => `Archive has too many entries: exceeded limit of ${maxEntries}`,
        )
        return header
      }

      // Reject entries with null bytes in names (defense in depth).
      if (header.name.includes('\0')) {
        scheduleDestroy(
          () => `Invalid null byte in archive entry name: ${header.name}`,
        )
        return header
      }

      // Reject symbolic links and hard links.
      if (header.type === 'symlink' || header.type === 'link') {
        scheduleDestroy(
          () =>
            `Symlink detected in archive: ${header.name}. Symlinks are not supported for security reasons.`,
        )
        return header
      }

      // Check individual file size
      if (header.size && header.size > maxFileSize) {
        scheduleDestroy(
          () =>
            `File size exceeds limit: ${header.name} (${header.size} bytes > ${maxFileSize} bytes)`,
        )
        return header
      }

      // Check total extracted size
      if (header.size) {
        totalExtractedSize += header.size
        if (totalExtractedSize > maxTotalSize) {
          scheduleDestroy(
            () =>
              `Total extracted size exceeds limit: ${totalExtractedSize} bytes > ${maxTotalSize} bytes`,
          )
          return header
        }
      }

      return header
    },
    strip,
  })

  // Attach error handler before starting pipeline to catch errors
  extractStream.on('error', () => {
    // Error will be caught by pipeline
  })

  return extractStream
}

/**
 * Shared implementation behind `extractTar` and `extractTarGz`: verify the
 * archive exists, create the output directory, then pipe the archive
 * (through gunzip when `gzipped` is true) into a guarded tar-fs extract
 * stream.
 *
 * @param archivePath - Path to the tar / tar.gz file
 * @param outputDir - Directory to extract to
 * @param options - Extraction options
 * @param gzipped - Whether to insert a gunzip stage before tar extraction
 * @throws Error with `code: 'ENOENT'` if the archive is missing, or
 *   whatever error the stream pipeline rejects with (including limit
 *   violations raised by the extract stream)
 * @private
 */
async function runTarExtraction(
  archivePath: string,
  outputDir: string,
  options: ExtractOptions,
  gzipped: boolean,
): Promise<void> {
  // Normalize the "missing archive" surface (see extractZip) — throw
  // ENOENT up front with a clear message rather than letting the
  // Node-level createReadStream eventually surface as a stream error.
  assertArchiveExists(archivePath)

  // Normalize output directory path for cross-platform compatibility
  const normalizedOutputDir = normalizePath(outputDir)
  await safeMkdir(normalizedOutputDir)

  const extractStream = createGuardedTarExtract(normalizedOutputDir, options)
  const readStream = createReadStream(archivePath)

  try {
    if (gzipped) {
      await pipeline(readStream, createGunzip(), extractStream)
    } else {
      await pipeline(readStream, extractStream)
    }
  } catch (e) {
    // Ensure stream is cleaned up on error
    readStream.destroy()
    throw e
  }
}

/**
 * Extract a tar archive to a directory.
 *
 * Link entries are rejected, and the `maxEntries`, `maxFileSize` and
 * `maxTotalSize` limits are enforced while extracting.
 *
 * @param archivePath - Path to tar file
 * @param outputDir - Directory to extract to
 * @param options - Extraction options
 * @throws Error with `code: 'ENOENT'` if the archive does not exist
 * @throws Error if an entry violates one of the extraction limits
 *
 * @example
 * ```typescript
 * await extractTar('/tmp/archive.tar', '/tmp/output')
 * await extractTar('/tmp/archive.tar', '/tmp/output', { strip: 1 })
 * ```
 */
export async function extractTar(
  archivePath: string,
  outputDir: string,
  options: ExtractOptions = {},
): Promise<void> {
  await runTarExtraction(archivePath, outputDir, options, false)
}

/**
 * Extract a gzipped tar archive to a directory.
 *
 * Behaves like `extractTar`, with a gunzip stage in front of the tar
 * extraction.
 *
 * @param archivePath - Path to tar.gz or tgz file
 * @param outputDir - Directory to extract to
 * @param options - Extraction options
 * @throws Error with `code: 'ENOENT'` if the archive does not exist
 * @throws Error if an entry violates one of the extraction limits
 *
 * @example
 * ```typescript
 * await extractTarGz('/tmp/archive.tar.gz', '/tmp/output')
 * await extractTarGz('/tmp/archive.tgz', '/tmp/output', { strip: 1 })
 * ```
 */
export async function extractTarGz(
  archivePath: string,
  outputDir: string,
  options: ExtractOptions = {},
): Promise<void> {
  await runTarExtraction(archivePath, outputDir, options, true)
}

/**
 * Extract a zip archive to a directory.
 *
 * Every file entry is validated before anything is written: the entry
 * count, null bytes in names, per-file and cumulative declared sizes, and
 * that the (optionally stripped) destination stays inside `outputDir`.
 * Only when the whole archive passes does extraction start.
 *
 * With `strip` left at 0 the archive is extracted in one call via
 * adm-zip's `extractAllTo`. Otherwise each file entry is written
 * individually to its stripped location; entries with `strip` or fewer
 * path components are skipped.
 *
 * @param archivePath - Path to zip file
 * @param outputDir - Directory to extract to
 * @param options - Extraction options
 * @throws Error with `code: 'ENOENT'` if the archive does not exist
 * @throws Error if the archive exceeds a limit, contains a null byte in an
 *   entry name, or has an entry that would land outside `outputDir`
 *
 * @example
 * ```typescript
 * await extractZip('/tmp/archive.zip', '/tmp/output')
 * await extractZip('/tmp/archive.zip', '/tmp/output', { strip: 1 })
 * ```
 */
export async function extractZip(
  archivePath: string,
  outputDir: string,
  options: ExtractOptions = {},
): Promise<void> {
  // Normalize the "missing archive" surface — throws ENOENT before
  // AdmZip can surface its generic "Invalid filename" message.
  assertArchiveExists(archivePath)

  const {
    maxEntries = DEFAULT_MAX_ENTRIES,
    maxFileSize = DEFAULT_MAX_FILE_SIZE,
    maxTotalSize = DEFAULT_MAX_TOTAL_SIZE,
    strip = 0,
  } = options

  // Normalize output directory path for cross-platform compatibility
  const normalizedOutputDir = normalizePath(outputDir)
  await safeMkdir(normalizedOutputDir)

  const AdmZip = getAdmZip()
  const zip = new AdmZip(archivePath)
  const path = getPath()
  const entries = zip.getEntries()

  // Check entry count to prevent inode exhaustion DoS.
  if (entries.length > maxEntries) {
    throw new ErrorCtor(
      `Archive has too many entries: ${entries.length} (limit: ${maxEntries})`,
    )
  }

  // Single validation pass. For each file entry that survives stripping we
  // also remember the directory it will be written to, so the extraction
  // pass below does not have to re-split and re-join every entry name.
  const plan: Array<{ entry: (typeof entries)[number]; targetDir: string }> =
    []
  let totalExtractedSize = 0

  for (const entry of entries) {
    // NOTE(review): directory entries are not validated here; with
    // strip === 0 they are still handed to extractAllTo — presumably
    // adm-zip sanitizes those paths itself, confirm.
    if (entry.isDirectory) {
      continue
    }

    // Reject entries with null bytes in names (defense in depth).
    if (entry.entryName.includes('\0')) {
      throw new ErrorCtor(
        `Invalid null byte in archive entry name: ${entry.entryName}`,
      )
    }

    // Check individual file size (as declared in the entry header).
    const uncompressedSize = entry.header.size
    if (uncompressedSize > maxFileSize) {
      throw new ErrorCtor(
        `File size exceeds limit: ${entry.entryName} (${uncompressedSize} bytes > ${maxFileSize} bytes)`,
      )
    }

    // Check total extracted size
    totalExtractedSize += uncompressedSize
    if (totalExtractedSize > maxTotalSize) {
      throw new ErrorCtor(
        `Total extracted size exceeds limit: ${totalExtractedSize} bytes > ${maxTotalSize} bytes`,
      )
    }

    // ZIP entries always use forward slashes per ZIP specification
    const parts = entry.entryName.split('/')
    if (parts.length <= strip) {
      // Nothing left of the path once the leading components are removed.
      continue
    }

    const strippedPath = ArrayPrototypeSlice(parts, strip).join('/')
    const targetPath = path.join(normalizedOutputDir, strippedPath)

    // Validate path is within target directory (prevents path traversal).
    // With strip === 0 the stripped path is the entry name itself, so this
    // also covers the extractAllTo branch for file entries.
    validatePathWithinBase(targetPath, normalizedOutputDir, entry.entryName)

    plan.push({ entry, targetDir: path.dirname(targetPath) })
  }

  if (strip === 0) {
    // Simple case: extract everything as-is.
    zip.extractAllTo(normalizedOutputDir, true)
    return
  }

  // Strip leading path components: create every destination directory
  // first (deduplicated), then write the files.
  const dirsToCreate = new SetCtor<string>()
  for (const { targetDir } of plan) {
    dirsToCreate.add(targetDir)
  }
  await PromiseAll(ArrayFrom(dirsToCreate).map(dir => safeMkdir(dir)))

  // Extract all files (synchronous operation). The entry is written under
  // its own base name directly into targetDir.
  for (const { entry, targetDir } of plan) {
    zip.extractEntryTo(entry, targetDir, false, true)
  }
}