| 321 | * Parse any string into separate chunks of plain text, HTML, and shortcodes. |
| 322 | * |
| 323 | * The return value is a 2-dimensional array of strings and metadata ordered |
| 324 | * by input, similar to a split string. Each node has these keys and values: |
| 325 | * |
| 326 | * key- type - value |
| 327 | * ------------------ |
| 328 | * 0 - string - The text that was split into this node from the input. |
| 329 | * 1 - int - Start position of input. |
| 330 | * 2 - int - End position of input. |
| 331 | * 3 - int - Length of text. Always equal to End - Start + 1. |
| 332 | * 4 - bool - Shortcode flag. |
| 333 | * 5 - bool - HTML flag. |
| 334 | * 6 - Reserved for future use to indicate the node is inline or block. |
| 335 | * |
| 336 | * The string in key 0 is plain text when keys 4 and 5 are both false. |
| 337 | * |
| 338 | * @since 4.0.1 |
| 339 | * @param string $text The user input that needs to be texturized. |
| 340 | * @return array Structured version of $text with its HTML and shortcodes separated. |
| 341 | */ |
function wptexturize_parse( $text ) {
	// Overview of the phases below:
	//   1. Split $text into shortcode tags and the plain text around them ($results).
	//   2. Blank out the shortcode tags and scan what is left for HTML elements,
	//      comments and shortcode-looking tags ($results2).
	//   3. Merge both lists, left to right, into $final. HTML nodes swallow any
	//      shortcode chunks they overlap; plain text is trimmed around each tag.
	// Every node is an array using keys 0-5: text, start, end, length,
	// shortcode flag, HTML flag. Start and end are inclusive offsets into $text.

	$results  = array(); // Stores the full shortcode matches.
	$results2 = array(); // Stores the HTML matches and is read-only after that.
	$final    = array(); // Stores the output of the parser.

	// Find shortcodes.
	$regex = '/' . get_shortcode_regex() . '/s';

	preg_match_all( $regex, $text, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );

	$pos = 0; // Offset of the first character of $text not yet stored in $results.
	foreach ( $matches as $data ) {
		// Collect data for this tag.
		$tag    = array();
		$tag[1] = $data[0][1];                    // Start position
		$tag[2] = strpos( $text, ']', $tag[1] );  // End position
		$tag[3] = $tag[2] - $tag[1] + 1;          // Length
		if ( strlen( $data[0][0] ) == $tag[3] + 1 && ! empty( $data[6][0] ) ) {
			// This is an escaped, non-enclosing shortcode.
			// The full match is exactly one character longer than the opening tag
			// and capture group 6 is non-empty, so include that extra character.
			$tag[2]++;
			$tag[3]++;
		}
		$tag[0] = substr( $text, $tag[1], $tag[3] ); // Tag text
		$tag[4] = true;                              // Is it a shortcode?
		$tag[5] = false;                             // Is it HTML?

		// Was there any text before this tag?
		if ( $tag[1] > $pos ) {
			$plain    = array();
			$plain[1] = $pos;
			$plain[2] = $tag[1] - 1;
			$plain[3] = $plain[2] - $plain[1] + 1;
			$plain[0] = substr( $text, $plain[1], $plain[3] );
			$plain[4] = false;
			$plain[5] = false;
			$results[] = $plain;
		}

		$results[] = $tag;

		// Is this an enclosing tag? The full match is longer than the opening tag.
		if ( strlen( $data[0][0] ) > $tag[3] ) {
			$close = array();

			if ( ! empty( $data[5][0] ) ) {
				// Capture group 5 is stored as the plain text between the
				// opening and closing tags.
				$plain    = array();
				$plain[0] = $data[5][0];
				$plain[1] = $data[5][1];
				$plain[2] = $plain[1] + strlen( $plain[0] ) - 1;
				$plain[3] = $plain[2] - $plain[1] + 1;
				$plain[4] = false;
				$plain[5] = false;
				$results[] = $plain;

				$close[1] = $plain[2] + 1;
			} else {
				$close[1] = $tag[2] + 1;
			}

			// The closing tag runs to the last character of the full match.
			$close[2] = $tag[1] + strlen( $data[0][0] ) - 1;
			$close[3] = $close[2] - $close[1] + 1;
			$close[0] = substr( $text, $close[1], $close[3] );
			$close[4] = true;
			$close[5] = false;
			$results[] = $close;

			$pos = $close[2] + 1;
		} else {
			$pos = $tag[2] + 1;
		}
	}

	// Was there any text after the last tag?
	if ( $pos < strlen( $text ) ) {
		$plain    = array();
		$plain[0] = substr( $text, $pos );
		$plain[1] = $pos;
		$plain[3] = strlen( $plain[0] );
		$plain[2] = $plain[1] + $plain[3] - 1;
		$plain[4] = false;
		$plain[5] = false;
		$results[] = $plain;
	}

	// $results is complete and never modified below, so count it once.
	$results_count = count( $results );

	// Now remove the shortcodes so we can look for the HTML.
	// Each shortcode is replaced by the same number of spaces, which keeps
	// every offset in the scratch string aligned with $text.
	$html = array();
	foreach ( $results as $chunk ) {
		// Is this chunk a shortcode tag?
		if ( $chunk[4] ) {
			$html[] = str_repeat( ' ', $chunk[3] );
		} else {
			$html[] = $chunk[0];
		}
	}
	$html = implode( '', $html );

	// Now look for HTML. If there are any nested shortcodes, avoid them,
	// but do not allow HTML inside the attributes of nested shortcodes.
	// As in the Shortcode API, there is no recursion by default.

	$comment_regex =
		  '!'           // Start of comment, after the <.
		. '(?:'         // Unroll the loop: Consume everything until --> is found.
		.     '-(?!->)' // Dash not followed by end of comment.
		.     '[^\-]*+' // Consume non-dashes.
		. ')*+'         // Loop possessively.
		. '(?:-->)?';   // End of comment. If not found, match all input.

	$shortcode_regex =
		  '\['              // Find start of shortcode.
		. '[\/\[]?'         // Shortcodes may begin with [/ or [[
		. '[^\s\/\[\]<>]'   // No whitespace before name.
		. '[^\[\]<>]*+'     // Shortcodes do not contain other shortcodes. Possessive critical.
		. '\]'              // Find end of shortcode.
		. '\]?';            // Shortcodes may end with ]]

	$regex =
		  '/('                   // Capture HTML.
		.     '<'                // Find start of element.
		.     '(?(?=!--)'        // Is this a comment?
		.         $comment_regex // Find end of comment.
		.     '|'
		.         '[^>]+>'       // Find end of element.
		.     ')'
		. ')|('                  // Capture shortcodes.
		.     $shortcode_regex   // Find shortcodes.
		. ')/s';

	preg_match_all( $regex, $html, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
	unset( $html );

	foreach ( $matches as $data ) {
		// Collect data for this tag.
		$tag    = array();
		$tag[1] = $data[0][1];                       // Start position
		$tag[3] = strlen( $data[0][0] );             // Length
		$tag[2] = $tag[1] + $tag[3] - 1;             // End position
		$tag[0] = substr( $text, $tag[1], $tag[3] ); // Tag text - Using $text to capture original input.
		$tag[4] = empty( $data[1][0] );              // Is it a shortcode? (The HTML capture group is empty.)
		$tag[5] = ! $tag[4];                         // Is it HTML?
		$results2[] = $tag;
	}

	// Look for standalone HTML tags.
	// Look for HTML tags surrounding shortcodes.
	// Look for shortcodes enclosed within shortcodes.

	$next_i     = 0; // When inner-looping, there is no need to visit the same nodes every time, so keep track.
	$next_start = 0; // Keep track of how much output has been saved.

	foreach ( $results2 as $tag ) {
		$tag_start_found = false;

		// Is it HTML?
		if ( $tag[5] ) {
			// Now loop through the shortcodes to see if any of them are inside an HTML element.
			// Texturization has been a left-to-right process, and when inside HTML,
			// the $no_texturize_tags list gets tested but the $no_texturize_shortcodes list does not.
			for ( $i = $next_i; $i < $results_count; $i++ ) {
				// Test for intersection of HTML in $results2 and shortcode chunks in $results.
				$chunk_start = $results[ $i ][1];
				$chunk_end   = $results[ $i ][2];

				if ( $chunk_start > $tag[2] ) {
					// This chunk lies entirely after the tag. Leave it for the next tag.
					break;
				}

				$html_starts_in_chunk = $tag[1] >= $chunk_start && $tag[1] <= $chunk_end;
				$html_ends_in_chunk   = $tag[2] >= $chunk_start && $tag[2] <= $chunk_end;
				$chunk_starts_in_html = $chunk_start >= $tag[1] && $chunk_start <= $tag[2];
				$chunk_ends_in_html   = $chunk_end >= $tag[1] && $chunk_end <= $tag[2];

				if ( ! $html_starts_in_chunk && ! $html_ends_in_chunk && ! $chunk_starts_in_html && ! $chunk_ends_in_html ) {
					// No overlap. Chunks before the tag are copied to the output,
					// trimmed at the front if part of them was already saved.
					if ( ! $tag_start_found ) {
						$chunk = $results[ $i ];
						if ( $next_start > $chunk[1] ) {
							$chunk[1] = $next_start;
							$chunk[3] = $chunk[2] - $chunk[1] + 1;
							$chunk[0] = substr( $text, $chunk[1], $chunk[3] );
						}
						$final[]    = $chunk;
						$next_i     = $i + 1;
						$next_start = $results[ $i ][2] + 1;
					}
					continue;
				}

				// Is the chunk a shortcode?
				if ( $results[ $i ][4] ) {

					// Our regexp never looks for HTML inside of shortcodes,
					// so the above tests are adequate to determine this chunk
					// is a shortcode contained within an HTML element.

					// Ignore the shortcode node. It's getting merged into the HTML node.
					$next_i     = $i + 1;
					$next_start = $results[ $i ][2] + 1;

				// Deal with the plain text chunk(s) that needs to be marked as HTML.
				} elseif ( $html_starts_in_chunk ) {
					$tag_start_found = true;
					if ( $tag[1] > $chunk_start ) {
						// Truncate the plain text.
						$plain    = array();
						$plain[1] = $next_start;
						$plain[2] = $tag[1] - 1;
						$plain[3] = $plain[2] - $plain[1] + 1;
						$plain[0] = substr( $text, $plain[1], $plain[3] );
						$plain[4] = false;
						$plain[5] = false;
						$final[]  = $plain;
					}
					$final[] = $tag;
					if ( $tag[2] < $chunk_end ) {
						// Need to visit this chunk again on the next loop.
						$next_i     = $i;
						$next_start = $tag[2] + 1;
						break;
					} else {
						$next_i     = $i + 1;
						$next_start = $chunk_end + 1;
					}
				} elseif ( $html_ends_in_chunk ) {
					if ( $tag[2] < $chunk_end ) {
						// Need to visit this chunk again on the next loop.
						// Leaving the loop here is essential: re-testing the same
						// chunk against the same tag would take this branch again
						// and never terminate.
						$next_i     = $i;
						$next_start = $tag[2] + 1;
						break;
					} else {
						// This chunk just duplicates the end of the tag. Ignore it.
						$next_i     = $i + 1;
						$next_start = $tag[2] + 1;
					}
				} else {
					// Any other chunks (text between shortcodes, inside HTML) can be ignored also.
					$next_i     = $i + 1;
					$next_start = $chunk_end + 1;
				}
			}
		} else {
			// It's a shortcode tag.
			// Now loop through the plain text chunks to see if any of them look like shortcodes enclosed in shortcodes.
			// We still want to avoid texturizing shortcodes, but we do not intend to run the full regexp recursively.
			for ( $i = $next_i; $i < $results_count; $i++ ) {
				// Test for intersection of shortcode-like tags in $results2 and plain text chunks in $results.
				$chunk_start = $results[ $i ][1];
				$chunk_end   = $results[ $i ][2];

				if ( $chunk_start > $tag[2] ) {
					// This chunk lies entirely after the tag. Leave it for the next tag.
					break;
				}

				$tag_starts_in_chunk = $tag[1] >= $chunk_start && $tag[1] <= $chunk_end;
				$tag_ends_in_chunk   = $tag[2] >= $chunk_start && $tag[2] <= $chunk_end;

				if ( ! $tag_starts_in_chunk || ! $tag_ends_in_chunk ) {
					// The tag is not fully contained in this chunk. Chunks before
					// the tag are copied to the output, trimmed at the front if
					// part of them was already saved.
					if ( ! $tag_start_found ) {
						$chunk = $results[ $i ];
						if ( $next_start > $chunk[1] ) {
							$chunk[1] = $next_start;
							$chunk[3] = $chunk[2] - $chunk[1] + 1;
							$chunk[0] = substr( $text, $chunk[1], $chunk[3] );
						}
						$final[]    = $chunk;
						$next_i     = $i + 1;
						$next_start = $results[ $i ][2] + 1;
					}
					continue;
				}

				$tag_start_found = true;

				if ( $tag[1] > $chunk_start ) {
					// Truncate the plain text.
					$plain    = array();
					$plain[1] = $next_start;
					$plain[2] = $tag[1] - 1;
					$plain[3] = $plain[2] - $plain[1] + 1;
					$plain[0] = substr( $text, $plain[1], $plain[3] );
					$plain[4] = false;
					$plain[5] = false;
					$final[]  = $plain;
				}
				$final[] = $tag;
				if ( $tag[2] < $chunk_end ) {
					// Need to visit this chunk again on the next loop.
					$next_i     = $i;
					$next_start = $tag[2] + 1;
					break;
				} else {
					$next_i     = $i + 1;
					$next_start = $chunk_end + 1;
				}
			}
		}
	}

	// Now check for plain text and shortcodes after the last HTML tag.
	for ( $i = $next_i; $i < $results_count; $i++ ) {
		$chunk_start = $results[ $i ][1];
		$chunk_end   = $results[ $i ][2];
		if ( $next_start > $chunk_start ) {
			// Part of this chunk was already saved. Keep only the remainder.
			$plain    = $results[ $i ];
			$plain[1] = $next_start;
			$plain[3] = $plain[2] - $plain[1] + 1;
			$plain[0] = substr( $text, $plain[1], $plain[3] );
			$final[]  = $plain;
		} else {
			$final[] = $results[ $i ];
		}
	}

	return $final;
}
| 655 | |
| 656 | /** |