module hunt.xml.DocumentParser;

import hunt.xml.Attribute;
import hunt.xml.Common;
import hunt.xml.Document;
import hunt.xml.Element;
import hunt.xml.Node;
import hunt.xml.Internal;

import hunt.logging.ConsoleLogger;

/**
 * Builds a `Document` from XML text.
 *
 * The parser works on a private mutable copy of the input and consumes it by
 * repeatedly re-slicing a `char[]` cursor (`text`). Every helper receives that
 * cursor by reference and leaves it positioned just after whatever it parsed.
 *
 * All look-ahead is bounds-checked: truncated input is reported as an
 * `XmlParsingException` instead of running off the end of the slice.
 */
class DocumentParser {

    /**
     * Parses a complete XML document.
     *
     * Params:
     *  stext  = the XML source; it is duplicated, the caller's string is never modified
     *  parent = optional document whose first node becomes the parent of the result
     *
     * Returns: the populated document
     *
     * Throws: XmlParsingException if a top-level construct does not start with
     *         `<`, if any nested construct is malformed or truncated, or if no
     *         node at all was produced ("no root element").
     */
    static Document parse(ParsingFlags Flags = ParsingFlags.Default)(string stext , Document parent = null)
    {
        Document document = new Document();
        document.setParent(parent ? parent.firstNode() : null);

        // Mutable working copy: some helpers write into the buffer.
        char[] text = stext.dup;

        parseBom(text);

        while (true)
        {
            skip!(WhitespacePred)(text);
            if (text.length == 0)
                break;
            if (text[0] != '<')
                throw new XmlParsingException("expected <", text);

            text = text[1 .. $]; // Skip '<'
            Element node = parseNode!(Flags)(text);
            if (node is null)
                continue; // Construct was recognised but produces no node

            document.appendNode(node);
            if (Flags & (ParsingFlags.OpenOnly | ParsingFlags.ParseOne))
            {
                // Stop once the element itself has been read; declarations,
                // comments etc. preceding it must not end the parse.
                if (node.getType() == NodeType.Element)
                    break;
            }
        }

        if (!document.firstNode())
            throw new XmlParsingException("no root element", text);

        return document;
    }

    /**
     * Advances `text` to the first occurrence of `terminator` (which is left
     * unconsumed).
     *
     * Throws: XmlParsingException("unexpected end of data") if the input ends,
     *         or a NUL character is met, before the terminator is found.
     */
    static private void skipToTerminator(ref char[] text, string terminator)
    {
        while (true)
        {
            if (text.length < terminator.length || text[0] == '\0')
                throw new XmlParsingException("unexpected end of data", text);
            if (text[0 .. terminator.length] == terminator)
                return;
            text = text[1 .. $];
        }
    }

    /**
     * Dispatches on the character following `<` and parses one markup construct.
     *
     * Params:
     *  text = cursor positioned just after `<`
     *
     * Returns: the parsed node, or null when the construct yields no node
     *          (disabled node kind, or an unrecognised `<!...>` that is skipped).
     *
     * Throws: XmlParsingException on malformed or truncated input.
     */
    static private Element parseNode(int Flags)(ref char[] text)
    {
        if (text.length == 0)
            throw new XmlParsingException("unexpected end of data", text);

        switch (text[0])
        {
            // <?...
            case '?':
                text = text[1 .. $]; // Skip '?'
                if (text.length >= 4 &&
                    (text[0] == 'x' || text[0] == 'X') &&
                    (text[1] == 'm' || text[1] == 'M') &&
                    (text[2] == 'l' || text[2] == 'L') &&
                    WhitespacePred.test(text[3]))
                {
                    // '<?xml ' - xml declaration
                    text = text[4 .. $];
                    return parseXmlDeclaration!Flags(text);
                }
                // Any other target is a processing instruction
                return parsePI!Flags(text);

            // <!...
            case '!':
                if (text.length >= 3 && text[1 .. 3] == "--")
                {
                    // '<!--' - comment
                    text = text[3 .. $];
                    return parseComment!Flags(text);
                }
                if (text.length >= 8 && text[1 .. 8] == "[CDATA[")
                {
                    // '<![CDATA[' - cdata
                    text = text[8 .. $];
                    return parseCdata!Flags(text);
                }
                if (text.length >= 9 && text[1 .. 8] == "DOCTYPE" &&
                    WhitespacePred.test(text[8]))
                {
                    // '<!DOCTYPE ' - doctype
                    text = text[9 .. $];
                    return parseDoctype!Flags(text);
                }

                // Unrecognised '<!...': skip it up to and including '>'
                text = text[1 .. $]; // Skip '!'
                skipToTerminator(text, ">");
                text = text[1 .. $]; // Skip '>'
                return null;

            // <...
            default:
                return parseElement!Flags(text);
        }
    }

    /**
     * Parses a CDATA section; `text` is positioned just after `<![CDATA[`.
     *
     * Returns: a CDATA node holding the raw section text, or null when
     *          `ParsingFlags.DataNodes` is set in `Flags`.
     *
     * Throws: XmlParsingException if `]]>` is never found.
     */
    static private Element parseCdata(int Flags)(ref char[] text)
    {
        char[] value = text;
        skipToTerminator(text, "]]>");

        Element cdata = null;
        if (!(Flags & ParsingFlags.DataNodes))
        {
            cdata = new Element(NodeType.CDATA);
            cdata.setText = cast(string)value[0 .. value.length - text.length].dup;
        }

        text = text[3 .. $]; // Skip ]]>
        return cdata;
    }

    /**
     * Parses a run of character data and attaches it to `node`.
     *
     * Params:
     *  node           = element receiving the data
     *  text           = cursor positioned at the first non-whitespace character of the data
     *  contents_start = cursor position before leading whitespace was skipped
     *
     * Returns: the character that ended the data run. It is returned because
     *          the buffer position may be overwritten with a zero terminator.
     *
     * Throws: XmlParsingException if the input ends inside the data.
     */
    static private char parseAndAppendData(int Flags)(Element node, ref char []text, char[] contents_start)
    {
        // Backup to contents start if whitespace trimming is disabled
        if (!(Flags & ParsingFlags.TrimWhitespace))
            text = contents_start;

        // Skip until end of data
        char[] value = text;
        char[] end;
        if (Flags & ParsingFlags.NormalizeWhitespace)
            end = skipAndExpandCharacterRefs!(TextPred, TextPureWithWsPred, Flags)(text);
        else
            end = skipAndExpandCharacterRefs!(TextPred, TextPureNoWsPred, Flags)(text);

        // `end` is the tail of the buffer that follows the data, so the data
        // itself is value[0 .. dataLength].
        size_t dataLength = value.length - end.length;

        // Trim trailing whitespace if flag is set; leading was already trimmed
        // by the whitespace skip performed by the caller.
        if (Flags & ParsingFlags.TrimWhitespace)
        {
            if (Flags & ParsingFlags.NormalizeWhitespace)
            {
                // Only a single trailing space is removed in this mode
                // (presumably whitespace runs are already condensed by the
                // skipping function - confirm against hunt.xml.Internal).
                if (dataLength > 0 && value[dataLength - 1] == ' ')
                    --dataLength;
            }
            else
            {
                // Backup until non-whitespace character is found
                while (dataLength > 0 && WhitespacePred.test(value[dataLength - 1]))
                    --dataLength;
            }
            end = value[dataLength .. $];
        }

        // Create new data node
        if (!(Flags & ParsingFlags.DataNodes))
        {
            Element data = new Element(NodeType.Text);
            data.setText = cast(string)value[0 .. dataLength].dup;
            node.appendNode(data);
        }

        // Add data to parent node if no data exists yet
        if (!(Flags & ParsingFlags.EelementValues))
            if (node.getText.length == 0)
                node.setText = cast(string)value[0 .. dataLength].dup;

        if (text.length == 0)
            throw new XmlParsingException("unexpected end of data", text);

        // Read the delimiter before the terminator can overwrite it.
        char delimiter = text[0];

        // Place zero terminator after value
        if (!(Flags & ParsingFlags.StringTerminators))
        {
            if (end.length > 0)
                end[0] = '\0';
        }

        return delimiter;
    }

    /**
     * Parses an element; `text` is positioned at the first character of its name.
     *
     * Reads the optional namespace prefix, the name, the attributes, and then
     * either `/>` or `>` followed by the contents (unless
     * `ParsingFlags.OpenOnly` is set).
     *
     * Returns: the parsed element (never null).
     *
     * Throws: XmlParsingException on a missing name, missing `>`, or a closed
     *         element when `ParsingFlags.OpenOnly` is set.
     */
    static private Element parseElement(int Flags)(ref char[] text)
    {
        Element element = new Element();

        char[] prefix = text;
        skip!(ElementNamePred)(text);
        if (text.length == prefix.length)
            throw new XmlParsingException("expected element name or prefix", text);

        if (text.length > 0 && text[0] == ':')
        {
            // What was read so far is the namespace prefix
            element.namespacePrefix = prefix[0 .. prefix.length - text.length].dup;
            text = text[1 .. $]; // Skip ':'

            char[] name = text;
            skip!(NodeNamePred)(text);
            if (text.length == name.length)
                throw new XmlParsingException("expected element local name", text);
            element.setName = name[0 .. name.length - text.length].dup;
        }
        else
        {
            element.setName = prefix[0 .. prefix.length - text.length].dup;
        }

        skip!(WhitespacePred)(text);
        parseNodeAttributes!(Flags)(text , element);

        if (text.length > 0 && text[0] == '>')
        {
            text = text[1 .. $]; // Skip '>'
            char[] contents = text;
            char[] contentsEnd = null;
            if (!(Flags & ParsingFlags.OpenOnly))
                contentsEnd = parseNodeContents!(Flags)(text , element);

            // contentsEnd stays null when the contents were not parsed; only a
            // real, non-empty span is recorded.
            if (contentsEnd !is null && contentsEnd.length != contents.length)
            {
                element.contents =
                    cast(string)contents[0 .. contents.length - contentsEnd.length].dup;
            }
        }
        else if (text.length > 0 && text[0] == '/')
        {
            text = text[1 .. $]; // Skip '/'
            if (text.length == 0 || text[0] != '>')
                throw new XmlParsingException("expected >", text);

            text = text[1 .. $]; // Skip '>'

            if (Flags & ParsingFlags.OpenOnly)
                throw new XmlParsingException("open_only, but closed", text);
        }
        else
            throw new XmlParsingException("expected >", text);

        return element;
    }

    /**
     * Parses everything between an element's start tag and its end tag,
     * appending child nodes and character data to `node`.
     *
     * Params:
     *  text = cursor positioned just after the start tag's `>`; on return it is
     *         positioned just after the end tag's `>`
     *  node = the element being filled
     *
     * Returns: the cursor as it was at the `</` of the end tag, i.e. the end of
     *          the element's contents.
     *
     * Throws: XmlParsingException on truncated input, a malformed end tag, a
     *         mismatching end tag name (with `ParsingFlags.ValidateClosingTags`),
     *         or any end tag at all when `ParsingFlags.OpenOnly` is set.
     */
    static private char[] parseNodeContents(int Flags)(ref char[] text , Element node)
    {
        while (true)
        {
            char[] contentsStart = text;
            skip!(WhitespacePred)(text);
            if (text.length == 0)
                throw new XmlParsingException("unexpected end of data", text);

            // Character data: consume it, then dispatch on the delimiter that
            // parseAndAppendData reports (the buffer may no longer hold it).
            char nextChar = text[0];
            while (nextChar != '<')
            {
                if (nextChar == '\0')
                    throw new XmlParsingException("unexpected end of data", text);
                nextChar = parseAndAppendData!(Flags)(node, text, contentsStart);
            }

            if (text.length > 1 && text[1] == '/')
            {
                // Node closing
                char[] contentsEnd = text;
                text = text[2 .. $]; // Skip '</'

                char[] closingName = text;
                skip!(NodeNamePred)(text);
                if (Flags & ParsingFlags.ValidateClosingTags)
                {
                    closingName = closingName[0 .. closingName.length - text.length];

                    // The element stores its name without the namespace
                    // prefix, so compare against the part after the first ':'.
                    // NOTE(review): the prefix itself is not validated here.
                    size_t localStart = 0;
                    foreach (i, c; closingName)
                    {
                        if (c == ':')
                        {
                            localStart = i + 1;
                            break;
                        }
                    }
                    if (closingName[localStart .. $] != node.getName)
                        throw new XmlParsingException("invalid closing tag name", text);
                }

                skip!(WhitespacePred)(text);
                if (text.length == 0 || text[0] != '>')
                    throw new XmlParsingException("expected >", text);
                text = text[1 .. $]; // Skip '>'

                if (Flags & ParsingFlags.OpenOnly)
                    throw new XmlParsingException("Unclosed element actually closed.", text);

                return contentsEnd;
            }

            // Child node
            text = text[1 .. $]; // Skip '<'
            if (Element child = parseNode!(Flags & ~ParsingFlags.OpenOnly)(text))
                node.appendNode(child);
        }
    }

    /**
     * Parses zero or more `name = "value"` attributes and appends them to `node`.
     *
     * Params:
     *  text = cursor positioned at the first attribute name (or at whatever
     *         follows the attribute list); trailing whitespace is consumed
     *  node = element receiving the attributes
     *
     * Throws: XmlParsingException on a missing `=`, a missing opening quote, or
     *         a value whose closing quote is missing.
     */
    static private void parseNodeAttributes(int Flags)(ref char[] text , Element node)
    {
        // Attribute values are never whitespace-normalised
        enum int AttFlags = Flags & ~ParsingFlags.NormalizeWhitespace;

        while (text.length > 0 && AttributeNamePred.test(text[0]))
        {
            // Extract attribute name
            char[] name = text;
            skip!(AttributeNamePred)(text);

            Attribute attribute = new Attribute();
            attribute.setName = cast(string)name[0 .. name.length - text.length].dup;
            node.appendAttribute(attribute);

            skip!(WhitespacePred)(text);
            if (text.length == 0 || text[0] != '=')
                throw new XmlParsingException("expected =", text);
            text = text[1 .. $]; // Skip '='

            skip!(WhitespacePred)(text);
            if (text.length == 0 || (text[0] != '\'' && text[0] != '"'))
                throw new XmlParsingException("expected ' or \"", text);
            char quote = text[0];
            text = text[1 .. $]; // Skip opening quote

            // Extract attribute value, expanding character references
            char[] value = text;
            char[] end;
            if (quote == '\'')
                end = skipAndExpandCharacterRefs!(AttributeValuePred!('\''), AttributeValuePurePred!('\''), AttFlags)(text);
            else
                end = skipAndExpandCharacterRefs!(AttributeValuePred!('"'), AttributeValuePurePred!('"'), AttFlags)(text);

            attribute.setValue = cast(string)value[0 .. value.length - end.length].dup;

            // The value must be closed by the quote that opened it
            if (text.length == 0 || text[0] != quote)
                throw new XmlParsingException("expected ' or \"", text);
            text = text[1 .. $]; // Skip closing quote

            skip!(WhitespacePred)(text);
        }
    }

    /**
     * Skips a leading UTF-8 byte order mark (EF BB BF) if one is present.
     * Inputs shorter than three bytes are left untouched.
     */
    static private void parseBom(ref char[] text)
    {
        if (text.length >= 3
            && text[0] == 0xEF
            && text[1] == 0xBB
            && text[2] == 0xBF)
        {
            text = text[3 .. $];
        }
    }

    /**
     * Parses an XML declaration; `text` is positioned just after `<?xml` and
     * the whitespace character that follows it.
     *
     * Returns: a declaration node carrying the declaration's attributes when
     *          `ParsingFlags.DeclarationNode` is set, otherwise null.
     *
     * Throws: XmlParsingException if the declaration is not closed by `?>`.
     */
    static private Element parseXmlDeclaration(int Flags)(ref char[] text)
    {
        static if (Flags & ParsingFlags.DeclarationNode) {
            Element declaration = new Element(NodeType.Declaration);

            // Skip whitespace before attributes or ?>
            skip!WhitespacePred(text);
            parseNodeAttributes!Flags(text, declaration);

            if (text.length < 2 || text[0] != '?' || text[1] != '>')
                throw new XmlParsingException("expected ?>", text);
            text = text[2 .. $]; // Skip '?>'

            return declaration;
        } else {
            // Declaration nodes are disabled: skip the whole declaration
            skipToTerminator(text, "?>");
            text = text[2 .. $]; // Skip '?>'
            return null;
        }
    }

    /**
     * Parses a processing instruction; `text` is positioned just after `<?`.
     *
     * Returns: a PI node (name = target, text = verbatim instruction) when
     *          `ParsingFlags.PiNodes` is set, otherwise null.
     *
     * Throws: XmlParsingException if the target is missing or `?>` is never found.
     */
    static private Element parsePI(int Flags)(ref char[] text)
    {
        if (Flags & ParsingFlags.PiNodes)
        {
            Element pi = new Element(NodeType.ProcessingInstruction);

            // Extract PI target name
            char[] name = text;
            skip!NodeNamePred(text);
            if (text.length == name.length)
                throw new XmlParsingException("expected PI target", text);
            pi.setName = cast(string)name[0 .. name.length - text.length].dup;

            // Skip whitespace between pi target and pi
            skip!WhitespacePred(text);

            // The value is taken verbatim: no entity expansion, no whitespace
            // normalization
            char[] value = text;
            skipToTerminator(text, "?>");
            pi.setText = cast(string)value[0 .. value.length - text.length].dup;

            text = text[2 .. $]; // Skip '?>'
            return pi;
        }
        else
        {
            skipToTerminator(text, "?>");
            text = text[2 .. $]; // Skip '?>'
            return null;
        }
    }

    /**
     * Parses a comment; `text` is positioned just after `<!--`.
     *
     * Returns: a comment node holding the comment text when
     *          `ParsingFlags.CommentNodes` is set, otherwise null.
     *
     * Throws: XmlParsingException if `-->` is never found.
     */
    static private Element parseComment(int Flags)(ref char[] text)
    {
        static if (Flags & ParsingFlags.CommentNodes) {
            char[] value = text;
            skipToTerminator(text, "-->");

            Element comment = new Element(NodeType.Comment);
            comment.setText = cast(string)value[0 .. value.length - text.length].dup;

            text = text[3 .. $]; // Skip '-->'
            return comment;
        } else {
            // Comment nodes are disabled: skip the whole comment
            skipToTerminator(text, "-->");
            text = text[3 .. $]; // Skip '-->'
            return null;
        }
    }

    /**
     * Parses a DOCTYPE declaration; `text` is positioned just after
     * `<!DOCTYPE` and the whitespace character that follows it.
     *
     * Bracketed sections (`[` ... `]`) are skipped by counting nesting depth,
     * so a `>` inside them does not end the declaration.
     *
     * Returns: a document-type node holding the raw declaration text when
     *          `ParsingFlags.DoctypeNode` is set, otherwise null.
     *
     * Throws: XmlParsingException if the input ends, or a NUL character is met,
     *         before the closing `>`.
     */
    static private Element parseDoctype(int Flags)(ref char[] text)
    {
        // Remember value start
        char[] value = text;

        // Skip to >
        while (true)
        {
            if (text.length == 0 || text[0] == '\0')
                throw new XmlParsingException("unexpected end of data", text);
            if (text[0] == '>')
                break;

            if (text[0] != '[')
            {
                // Other character, skip it
                text = text[1 .. $];
                continue;
            }

            // '[' encountered: scan for the matching ']' by nesting depth
            text = text[1 .. $]; // Skip '['
            int depth = 1;
            while (depth > 0)
            {
                if (text.length == 0 || text[0] == '\0')
                    throw new XmlParsingException("unexpected end of data", text);

                if (text[0] == '[')
                    ++depth;
                else if (text[0] == ']')
                    --depth;
                // Any other character is part of the subset and is skipped
                text = text[1 .. $];
            }
        }

        Element doctype = null;
        if (Flags & ParsingFlags.DoctypeNode)
        {
            doctype = new Element(NodeType.DocumentType);
            doctype.setText = cast(string)value[0 .. value.length - text.length].dup;
        }

        text = text[1 .. $]; // Skip '>'
        return doctype;
    }

}