mark-parser/src/tokenize.mts at main · flex-development/mark-parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/**
 * @file tokenize
 * @module mark-parser/tokenize
 */

import type {
  Tokenizable,
  TokenizeOptions
} from '@flex-development/mark-parser'
import { codes } from '@flex-development/mark-util-symbol'
import type {
  Context,
  Event,
  FileLike,
  TokenizeContext,
  Value
} from '@flex-development/mark/parse'
import isList from './internal/is-list.mts'
import nil from './internal/nil.mts'
import size from './internal/size.mts'
import decode from './utils/decode.mts'

export default tokenize

/**
 * Tokenize a `value`.
 *
 * Lists are decoded and written to the tokenizer chunk by chunk; a stream
 * break is written between chunks when `options.breaks` is enabled.
 * Any other value that passes the `nil` check is decoded as a single chunk
 * and preprocessed before being written.
 * When `options.chunker` is set, each decoded string is split on that pattern
 * instead (empty chunks, i.e. `codes.empty`, are always written as-is).
 * The end of stream code is always written last.
 *
 * @see {@linkcode Context}
 * @see {@linkcode Event}
 * @see {@linkcode Tokenizable}
 * @see {@linkcode TokenizeContext}
 * @see {@linkcode TokenizeOptions}
 *
 * @this {void}
 *
 * @param {Tokenizable | null | undefined} value
 *  The file, value, or list to tokenize
 * @param {Context | TokenizeContext} context
 *  The tokenizer to write to
 * @param {TokenizeOptions | null | undefined} [options]
 *  Options for tokenizing `value`
 * @return {Event[]}
 *  The list of events returned by writing the end of stream code
 */
function tokenize(
  this: void,
  value: Tokenizable | null | undefined,
  context: Context | TokenizeContext,
  options?: TokenizeOptions | null | undefined
): Event[] {
  options ??= {}

  if (isList<FileLike | Value>(value)) {
    /**
     * The size of the list.
     *
     * @const {number} count
     */
    const count: number = size(value)

    for (const [index, chunk] of decode(value, context.encoding).entries()) {
      /**
       * Whether this is the last chunk in the list.
       *
       * @const {boolean} end
       */
      const end: boolean = index === count - 1

      // write to tokenizer or chunk again and write.
      if (chunk === codes.empty || !options.chunker) {
        context.write(chunk)
      } else {
        chunker(options.chunker, chunk)
      }

      // conditionally write stream break (never after the last chunk).
      if (options.breaks && !end) context.write(codes.break)
    }
  } else if (!nil(value)) {
    /**
     * The decoded chunk.
     *
     * @const {string | typeof codes.empty} decoded
     */
    const decoded: string | typeof codes.empty = decode(value, context.encoding)

    if (decoded === codes.empty) {
      context.write(decoded)
    } else if (options.chunker) {
      chunker(options.chunker, decoded)
    } else {
      context.write(context.preprocess(decoded, context.encoding))
    }
  }

  return context.write(codes.eos)

  /**
   * Split `input` on `pattern` and write the pieces to the tokenizer.
   *
   * Text between matches is written as-is; each match is passed through
   * `context.preprocess` before it is written.
   *
   * The pattern is copied before use. `String.prototype.matchAll` throws a
   * `TypeError` for patterns without the global flag and begins searching at
   * the pattern's `lastIndex`; the copy always has the global flag and a
   * `lastIndex` of `0`, so any `RegExp` can be used to chunk the whole input.
   *
   * @this {void}
   *
   * @param {RegExp} pattern
   *  The regular expression used to create chunks
   * @param {string} input
   *  The string to chunk
   * @return {undefined}
   */
  function chunker(this: void, pattern: RegExp, input: string): undefined {
    /**
     * The flags of the matcher; guaranteed to include the global flag.
     *
     * @const {string} flags
     */
    const flags: string = pattern.global ? pattern.flags : pattern.flags + 'g'

    /**
     * A fresh, global copy of the pattern.
     *
     * @const {RegExp} matcher
     */
    const matcher: RegExp = new RegExp(pattern.source, flags)

    /**
     * The index where the last match ends.
     *
     * @var {number} index
     */
    let index: number = 0

    // write chunks to the tokenizer.
    for (const { 0: match, index: start } of input.matchAll(matcher)) {
      /**
       * The index at which the match ends.
       *
       * @const {number} end
       */
      const end: number = start + match.length

      // text between matches.
      if (start > index) context.write(input.slice(index, start))

      // the match itself; write preprocessed match to stream.
      context.write(context.preprocess(match))
      index = end
    }

    // remaining tail text after the last match.
    if (index < input.length) context.write(input.slice(index))

    return undefined
  }
}