/**
 * @typedef {import('unist').Point} Point
+ * @typedef {import('unist').Position} UnistPosition
+ * @typedef {import('unist').Parent} UnistParent
 *
 * @typedef {import('nlcst').Root} NlcstRoot
 * @typedef {import('nlcst').Content} NlcstContent
 * @typedef {import('nlcst').SentenceContent} NlcstSentenceContent
 * @typedef {import('nlcst').WhiteSpace} NlcstWhiteSpace
- * @typedef {import('nlcst').Source} NlcstSource
- * @typedef {NlcstRoot|NlcstContent} NlcstNode
+ * @typedef {import('nlcst').Sentence} NlcstSentence
+ * @typedef {import('nlcst').Paragraph} NlcstParagraph
 *
 * @typedef {import('mdast').Root} MdastRoot
 * @typedef {import('mdast').Content} MdastContent
- * @typedef {MdastRoot|MdastContent} MdastNode
- * @typedef {Extract<MdastNode, import('unist').Parent>} MdastParent
 *
 * @typedef {import('vfile').VFile} VFile
+ *
 * @typedef {ReturnType<import('vfile-location').location>} Location
+ */
+
+/**
+ * @typedef {MdastRoot | MdastContent} MdastNode
+ * @typedef {NlcstRoot | NlcstContent} NlcstNode
+ * @typedef {Extract<NlcstNode, UnistParent>} NlcstParent
+ * @typedef {Extract<MdastNode, UnistParent>} MdastParent
+ *
 * @typedef {{
- *   parse(nodes: Array<NlcstContent>): NlcstRoot
- *   tokenizeSource(value: string): NlcstSource
- *   tokenizeWhiteSpace(value: string): NlcstWhiteSpace
- *   tokenize(value: string): Array<NlcstSentenceContent>
+ *   tokenizeSentencePlugins: Array<(node: NlcstSentence) => void>,
+ *   tokenizeParagraphPlugins: Array<(node: NlcstParagraph) => void>,
+ *   tokenizeRootPlugins: Array<(node: NlcstRoot) => void>,
+ *   parse(value: string | null | undefined): NlcstRoot
+ *   tokenize(value: string | null | undefined): Array<NlcstSentenceContent>
 * }} ParserInstance
 * @typedef {new () => ParserInstance} ParserConstructor
 *
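For context (not part of the commit): the reworked `ParserInstance` shape matches what parsers built on parse-latin, such as parse-english, already expose, so a tokenizer plugin is registered by pushing onto the relevant array. A minimal sketch, assuming `parse-english` is installed:

```js
import {ParseEnglish} from 'parse-english'

const parser = new ParseEnglish()

// The members named in the new typedef are plain instance fields.
parser.tokenizeSentencePlugins.push(function (sentence) {
  // Receives each `SentenceNode`; may inspect or mutate it in place.
})

console.log(Array.isArray(parser.tokenizeParagraphPlugins)) // true
console.log(typeof parser.parse) // 'function'
```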
@@ -44,6 +54,11 @@ import {location} from 'vfile-location'
const defaultIgnore = ['table', 'tableRow', 'tableCell']
const defaultSource = ['inlineCode']

+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/expressions.js#L5>
+const newLine = /^[ \t]*((\r?\n|\r)[ \t]*)+$/
+const terminalMarker = /^([!.?\u2026\u203D]+)$/
+
/**
 * Transform a `tree` in mdast to nlcst.
 *
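As a quick, illustrative aside (not part of the commit), the two expressions ported above classify token values: `terminalMarker` recognizes runs of sentence-final punctuation, and `newLine` recognizes whitespace that contains at least one line ending, i.e. a blank-line separator:

```js
const newLine = /^[ \t]*((\r?\n|\r)[ \t]*)+$/
const terminalMarker = /^([!.?\u2026\u203D]+)$/

console.log(terminalMarker.test('.')) // true: a full stop ends a sentence
console.log(terminalMarker.test('?!')) // true: runs of terminal markers match too
console.log(terminalMarker.test(',')) // false: a comma does not end a sentence

console.log(newLine.test('\n\n')) // true: a blank line separates paragraphs
console.log(newLine.test('  ')) // false: plain spaces stay inside a paragraph
```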
@@ -52,6 +67,7 @@ const defaultSource = ['inlineCode']
 * @param {ParserInstance|ParserConstructor} Parser
 * @param {Options} [options]
 */
+// eslint-disable-next-line complexity
export function toNlcst(tree, file, Parser, options = {}) {
  // Crash on invalid parameters.
  if (!tree || !tree.type) {
@@ -78,31 +94,78 @@ export function toNlcst(tree, file, Parser, options = {}) {

  const parser = 'parse' in Parser ? Parser : new Parser()

-  const result = one(
-    {
-      doc: String(file),
-      place: location(file),
-      parser,
-      ignore: options.ignore
-        ? defaultIgnore.concat(options.ignore)
-        : defaultIgnore,
-      source: options.source
-        ? defaultSource.concat(options.source)
-        : defaultSource
-    },
-    tree
-  )
-
-  // Transform mdast into nlcst tokens, and pass these into `parser.parse` to
-  // insert sentences, paragraphs where needed.
-  return parser.parse(result || [])
+  /** @type {Context} */
+  const context = {
+    doc: String(file),
+    place: location(file),
+    parser,
+    ignore: options.ignore
+      ? defaultIgnore.concat(options.ignore)
+      : defaultIgnore,
+    source: options.source
+      ? defaultSource.concat(options.source)
+      : defaultSource
+  }
+
+  const result = one(context, tree)
+
+  if (result && result.length > 0) {
+    const start = pointStart(result[0])
+    const end = pointEnd(result[result.length - 1])
+
+    // Turn into a sentence.
+    /** @type {NlcstSentence} */
+    const sentence = {type: 'SentenceNode', children: result}
+
+    if (start && start.line && end && end.line) {
+      sentence.position = {start, end}
+    }
+
+    let index = -1
+    while (parser.tokenizeSentencePlugins[++index]) {
+      parser.tokenizeSentencePlugins[index](sentence)
+    }
+
+    // Turn into a paragraph.
+    /** @type {NlcstParagraph} */
+    const paragraph = {
+      type: 'ParagraphNode',
+      children: splitNode(sentence, 'PunctuationNode', terminalMarker)
+    }
+    if (start && start.line && end && end.line) {
+      paragraph.position = {start: {...start}, end: {...end}}
+    }
+
+    index = -1
+    while (parser.tokenizeParagraphPlugins[++index]) {
+      parser.tokenizeParagraphPlugins[index](paragraph)
+    }
+
+    /** @type {NlcstRoot} */
+    const root = {
+      type: 'RootNode',
+      children: splitNode(paragraph, 'WhiteSpaceNode', newLine)
+    }
+    if (start && start.line && end && end.line) {
+      root.position = {start: {...start}, end: {...end}}
+    }
+
+    index = -1
+    while (parser.tokenizeRootPlugins[++index]) {
+      parser.tokenizeRootPlugins[index](root)
+    }
+
+    return root
+  }
+
+  return {type: 'RootNode', children: []}
}

/**
 * Transform a single node.
 * @param {Context} config
 * @param {MdastNode} node
- * @returns {Array<NlcstContent>|undefined}
+ * @returns {Array<NlcstSentenceContent>|undefined}
 */
function one(config, node) {
  const start = node.position ? node.position.start.offset : undefined
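The hunk above replaces the old hand-off to `parser.parse` with an explicit pipeline: the tokens produced from mdast are wrapped in a `SentenceNode`, which is split into sentences at terminal punctuation; those become the children of a `ParagraphNode`, which is in turn split into paragraphs at blank-line whitespace to form the children of the `RootNode`, and the parser's sentence, paragraph, and root plugins run at each level. A self-contained sketch of the splitting step, using hypothetical hand-built nodes (`splitAt` is a simplified re-statement of `splitNode`, which appears at the end of this diff and uses `toString` from nlcst-to-string instead of reading `value` directly):

```js
const terminalMarker = /^([!.?\u2026\u203D]+)$/

// Split `node` at children of `childType` whose value matches `expression`.
function splitAt(node, childType, expression) {
  const result = []
  let start = 0
  let index = -1

  while (++index < node.children.length) {
    const token = node.children[index]

    if (
      index === node.children.length - 1 ||
      (token.type === childType && expression.test(token.value))
    ) {
      // Close the current slice: it becomes one node of the parent's type.
      result.push({type: node.type, children: node.children.slice(start, index + 1)})
      start = index + 1
    }
  }

  return result
}

const sentence = {
  type: 'SentenceNode',
  children: [
    {type: 'WordNode', children: [{type: 'TextNode', value: 'Hi'}]},
    {type: 'PunctuationNode', value: '.'},
    {type: 'WhiteSpaceNode', value: ' '},
    {type: 'WordNode', children: [{type: 'TextNode', value: 'Bye'}]},
    {type: 'PunctuationNode', value: '.'}
  ]
}

// Two sentences come out: "Hi." and " Bye.".
console.log(splitAt(sentence, 'PunctuationNode', terminalMarker).length) // 2
```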
@@ -112,9 +175,10 @@ function one(config, node) {
    return patch(
      config,
      [
-        config.parser.tokenizeSource(
-          config.doc.slice(start, node.position.end.offset)
-        )
+        {
+          type: 'SourceNode',
+          value: config.doc.slice(start, node.position.end.offset)
+        }
      ],
      start
    )
@@ -133,7 +197,7 @@ function one(config, node) {
  }

  if (node.type === 'break') {
-    return patch(config, [config.parser.tokenizeWhiteSpace('\n')], start)
+    return patch(config, [{type: 'WhiteSpaceNode', value: '\n'}], start)
  }

  if (node.type === 'text') {
@@ -146,11 +210,11 @@ function one(config, node) {
 * Transform all nodes in `parent`.
 * @param {Context} config
 * @param {MdastParent} parent
- * @returns {Array<NlcstContent>}
+ * @returns {Array<NlcstSentenceContent>}
 */
function all(config, parent) {
  let index = -1
-  /** @type {Array<NlcstContent>} */
+  /** @type {Array<NlcstSentenceContent>} */
  const results = []
  /** @type {Point|undefined} */
  let end
@@ -165,9 +229,11 @@ function all(config, parent) {
      start.line !== null &&
      start.line !== end.line
    ) {
-      const lineEnding = config.parser.tokenizeWhiteSpace(
-        '\n'.repeat(start.line - end.line)
-      )
+      /** @type {NlcstWhiteSpace} */
+      const lineEnding = {
+        type: 'WhiteSpaceNode',
+        value: '\n'.repeat(start.line - end.line)
+      }
      patch(config, [lineEnding], end.offset)

      if (lineEnding.value.length < 2) {
@@ -222,3 +288,51 @@ function patch(config, nodes, offset) {

  return nodes
}
+
+// Ported from:
+// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/index.js#L266-L310>
+/**
+ * A function that splits one node into several nodes.
+ *
+ * @template {NlcstParent} TheNode
+ * @param {TheNode} node
+ * @param {NlcstContent['type']} childType
+ * @param {RegExp} expression
+ * @returns {Array<TheNode>}
+ */
+function splitNode(node, childType, expression) {
+  /** @type {Array<TheNode>} */
+  const result = []
+  let index = -1
+  let start = 0
+
+  while (++index < node.children.length) {
+    const token = node.children[index]
+
+    if (
+      index === node.children.length - 1 ||
+      (token.type === childType && expression.test(toString(token)))
+    ) {
+      /** @type {TheNode} */
+      // @ts-expect-error: fine
+      const parent = {
+        type: node.type,
+        children: node.children.slice(start, index + 1)
+      }
+
+      const first = node.children[start]
+      const last = token
+      if (first.position && last.position) {
+        parent.position = {
+          start: first.position.start,
+          end: last.position.end
+        }
+      }
+
+      result.push(parent)
+      start = index + 1
+    }
+  }
+
+  return result
+}
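With those pieces in place, `toNlcst` now returns a finished `RootNode` itself instead of delegating to `parser.parse`. A minimal end-to-end sketch, assuming this module is published as `mdast-util-to-nlcst` and that `mdast-util-from-markdown`, `parse-english`, and `vfile` are available (the package names are assumptions, not shown in the diff):

```js
import {fromMarkdown} from 'mdast-util-from-markdown'
import {ParseEnglish} from 'parse-english'
import {VFile} from 'vfile'
import {toNlcst} from 'mdast-util-to-nlcst'

const file = new VFile('Some *emphasis*, **importance**, and `code`.\n')
const tree = fromMarkdown(String(file))

// `ParseEnglish` satisfies `ParserConstructor`: it is instantiated here and
// its sentence, paragraph, and root plugins run over the generated nodes.
const nlcst = toNlcst(tree, file, ParseEnglish)

console.dir(nlcst, {depth: null})
```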