| | 321 | * Parse any string into separate chunks of plain text, HTML, and shortcodes. |
| | 322 | * |
| | 323 | * The return value is a 2-dimensional array of strings and metadata ordered |
| | 324 | * by input, similar to a split string. Each node has these keys and values: |
| | 325 | * |
| | 326 | * key- type - value |
| | 327 | * ------------------ |
| | 328 | * 0 - string - The text that was split into this node from the input. |
| | 329 | * 1 - int - Start position of input. |
| | 330 | * 2 - int - End position of input. |
| | 331 | * 3 - int - Length of text. Always equal to End - Start + 1. |
| | 332 | * 4 - bool - Shortcode flag. |
| | 333 | * 5 - bool - HTML flag. |
| | 334 | * 6 - Reserved for future use to indicate the node is inline or block. |
| | 335 | * |
| | 336 | * The string in key 0 is plain text when keys 4 and 5 are both false. |
| | 337 | * |
| | 338 | * @since 4.0.1 |
| | 339 | * @param string $text The user input that needs to be texturized. |
| | 340 | * @return array Structured version of $text with its HTML and shortcodes separated. |
| | 341 | */ |
function wptexturize_parse( $text ) {
	// Overview of the three passes below:
	//   1. Split $text on registered shortcodes (via get_shortcode_regex()) into $results.
	//   2. Blank out those shortcodes and scan the remainder for HTML elements and
	//      shortcode-like tags, collected in $results2.
	//   3. Merge $results and $results2, ordered by input position, into $final.
	// Every node is array( 0 => text, 1 => start, 2 => end, 3 => length, 4 => is shortcode, 5 => is HTML ).

	$results  = array(); // Stores the full shortcode matches and the plain text between them.
	$results2 = array(); // Stores the HTML matches and is read-only after that.
	$final    = array(); // Stores the output of the parser.

	// Find shortcodes.
	$regex = '/' . get_shortcode_regex() . '/s';

	preg_match_all( $regex, $text, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );

	$pos = 0; // First input offset not yet covered by a node in $results.
	foreach ( $matches as $data ) {
		// Collect data for this tag.
		$tag    = array();
		$tag[1] = $data[0][1]; // Start position
		$tag[2] = strpos( $text, ']', $tag[1] ); // End position
		$tag[3] = $tag[2] - $tag[1] + 1; // Length
		if ( strlen( $data[0][0] ) == $tag[3] + 1 && ! empty( $data[6][0] ) ) {
			// This is an escaped, non-enclosing shortcode: include the second "]".
			$tag[2]++;
			$tag[3]++;
		}
		$tag[0] = substr( $text, $tag[1], $tag[3] ); // Tag text
		$tag[4] = true; // Is it a shortcode?
		$tag[5] = false; // Is it HTML?

		// Was there any text before this tag?
		if ( $tag[1] > $pos ) {
			$plain     = array();
			$plain[1]  = $pos;
			$plain[2]  = $tag[1] - 1;
			$plain[3]  = $plain[2] - $plain[1] + 1;
			$plain[0]  = substr( $text, $plain[1], $plain[3] );
			$plain[4]  = false;
			$plain[5]  = false;
			$results[] = $plain;
		}

		$results[] = $tag;

		// Is this an enclosing tag? The full match is longer than the opening tag.
		if ( strlen( $data[0][0] ) > $tag[3] ) {
			$close = array();

			// Compare against '' instead of using empty(): empty( '0' ) is true in PHP,
			// which would fold shortcode content "0" into the closing tag node.
			if ( isset( $data[5][0] ) && '' !== $data[5][0] ) {
				$plain     = array();
				$plain[0]  = $data[5][0];
				$plain[1]  = $data[5][1];
				$plain[2]  = $plain[1] + strlen( $plain[0] ) - 1;
				$plain[3]  = $plain[2] - $plain[1] + 1;
				$plain[4]  = false;
				$plain[5]  = false;
				$results[] = $plain;

				$close[1] = $plain[2] + 1;
			} else {
				$close[1] = $tag[2] + 1;
			}

			$close[2]  = $tag[1] + strlen( $data[0][0] ) - 1;
			$close[3]  = $close[2] - $close[1] + 1;
			$close[0]  = substr( $text, $close[1], $close[3] );
			$close[4]  = true;
			$close[5]  = false;
			$results[] = $close;

			$pos = $close[2] + 1;
		} else {
			$pos = $tag[2] + 1;
		}
	}

	// Was there any text after the last tag?
	if ( $pos < strlen( $text ) ) {
		$plain     = array();
		$plain[0]  = substr( $text, $pos );
		$plain[1]  = $pos;
		$plain[3]  = strlen( $plain[0] );
		$plain[2]  = $plain[1] + $plain[3] - 1;
		$plain[4]  = false;
		$plain[5]  = false;
		$results[] = $plain;
	}

	// $results is not modified past this point, so its size can be computed once.
	$results_count = count( $results );

	// Now remove the shortcodes so we can look for the HTML.
	// Each shortcode is replaced by spaces of equal length so that offsets in
	// $html line up with offsets in $text.
	$html = array();
	foreach ( $results as $chunk ) {
		// Is this chunk a shortcode tag?
		if ( $chunk[4] ) {
			$html[] = str_repeat( ' ', $chunk[3] );
		} else {
			$html[] = $chunk[0];
		}
	}
	$html = implode( '', $html );

	// Now look for HTML. If there are any nested shortcodes, avoid them,
	// but do not allow HTML inside the attributes of nested shortcodes.
	// As in the Shortcode API, there is no recursion by default.

	$comment_regex =
		  '!'           // Start of comment, after the <.
		. '(?:'         // Unroll the loop: Consume everything until --> is found.
		.     '-(?!->)' // Dash not followed by end of comment.
		.     '[^\-]*+' // Consume non-dashes.
		. ')*+'         // Loop possessively.
		. '(?:-->)?';   // End of comment. If not found, match all input.

	$shortcode_regex =
		  '\['              // Find start of shortcode.
		. '[\/\[]?'         // Shortcodes may begin with [/ or [[
		. '[^\s\/\[\]<>]'   // No whitespace before name.
		. '[^\[\]<>]*+'     // Shortcodes do not contain other shortcodes. Possessive critical.
		. '\]'              // Find end of shortcode.
		. '\]?';            // Shortcodes may end with ]]

	$regex =
		  '/('                   // Capture HTML.
		.     '<'                // Find start of element.
		.     '(?(?=!--)'        // Is this a comment?
		.         $comment_regex // Find end of comment.
		.     '|'
		.         '[^>]+>'       // Find end of element.
		.     ')'
		. ')|('                  // Capture shortcodes.
		.     $shortcode_regex   // Find shortcodes.
		. ')/s';

	preg_match_all( $regex, $html, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
	unset( $html );

	foreach ( $matches as $data ) {
		// Collect data for this tag.
		$tag        = array();
		$tag[1]     = $data[0][1]; // Start position
		$tag[3]     = strlen( $data[0][0] ); // Length
		$tag[2]     = $tag[1] + $tag[3] - 1; // End position
		$tag[0]     = substr( $text, $tag[1], $tag[3] ); // Tag text - Using $text to capture original input.
		$tag[4]     = empty( $data[1][0] ); // Is it a shortcode? (The HTML capture group did not match.)
		$tag[5]     = ! $tag[4]; // Is it HTML?
		$results2[] = $tag;
	}

	// Look for standalone HTML tags.
	// Look for HTML tags surrounding shortcodes.
	// Look for shortcodes enclosed within shortcodes.

	$next_i     = 0; // When inner-looping, there is no need to visit the same nodes every time, so keep track.
	$next_start = 0; // Keep track of how much output has been saved.

	foreach ( $results2 as $tag ) {
		$tag_start_found = false;

		// Is it HTML?
		if ( $tag[5] ) {
			// Now loop through the shortcodes to see if any of them are inside an HTML element.
			// Texturization has been a left-to-right process, and when inside HTML,
			// the $no_texturize_tags list gets tested but the $no_texturize_shortcodes list does not.
			for ( $i = $next_i; $i < $results_count; $i++ ) {
				// Test for intersection of HTML in $results2 and shortcode chunks in $results.
				$chunk_start = $results[ $i ][1];
				$chunk_end   = $results[ $i ][2];

				if ( $chunk_start > $tag[2] ) {
					break;
				}

				$html_starts_in_chunk = $tag[1] >= $chunk_start && $tag[1] <= $chunk_end;
				$html_ends_in_chunk   = $tag[2] >= $chunk_start && $tag[2] <= $chunk_end;
				$chunk_starts_in_html = $chunk_start >= $tag[1] && $chunk_start <= $tag[2];
				$chunk_ends_in_html   = $chunk_end >= $tag[1] && $chunk_end <= $tag[2];

				if ( ! $html_starts_in_chunk && ! $html_ends_in_chunk && ! $chunk_starts_in_html && ! $chunk_ends_in_html ) {
					// No overlap: the chunk lies before the tag. Emit it unless the tag was already emitted.
					if ( ! $tag_start_found ) {
						$chunk = $results[ $i ];
						if ( $next_start > $chunk[1] ) {
							// Part of this chunk was already output; keep only the remainder.
							$chunk[1] = $next_start;
							$chunk[3] = $chunk[2] - $chunk[1] + 1;
							$chunk[0] = substr( $text, $chunk[1], $chunk[3] );
						}
						$final[]    = $chunk;
						$next_i     = $i + 1;
						$next_start = $results[ $i ][2] + 1;
					}
					continue;
				}

				// Is the chunk a shortcode?
				if ( $results[ $i ][4] ) {

					// Our regexp never looks for HTML inside of shortcodes,
					// so the above tests are adequate to determine this chunk
					// is a shortcode contained within an HTML element.

					// Ignore the shortcode node. It's getting merged into the HTML node.
					$next_i     = $i + 1;
					$next_start = $results[ $i ][2] + 1;

				// Deal with the plain text chunk(s) that needs to be marked as HTML.
				} elseif ( $html_starts_in_chunk ) {
					$tag_start_found = true;
					// Only emit leading text when some remains unsaved; comparing with
					// $next_start as well avoids a zero-length node between adjacent tags.
					if ( $tag[1] > $chunk_start && $tag[1] > $next_start ) {
						// Truncate the plain text.
						$plain    = array();
						$plain[1] = $next_start;
						$plain[2] = $tag[1] - 1;
						$plain[3] = $plain[2] - $plain[1] + 1;
						$plain[0] = substr( $text, $plain[1], $plain[3] );
						$plain[4] = false;
						$plain[5] = false;
						$final[]  = $plain;
					}
					$final[] = $tag;
					if ( $tag[2] < $chunk_end ) {
						// Need to visit this chunk again on the next loop.
						$next_i     = $i;
						$next_start = $tag[2] + 1;
						break;
					} else {
						$next_i     = $i + 1;
						$next_start = $chunk_end + 1;
					}
				} elseif ( $html_ends_in_chunk ) {
					if ( $tag[2] < $chunk_end ) {
						// Need to visit this chunk again on the next loop.
						// The break is required: without it the same chunk would be
						// re-tested against the same tag forever.
						$next_i     = $i;
						$next_start = $tag[2] + 1;
						break;
					} else {
						// This chunk just duplicates the end of the tag. Ignore it.
						$next_i     = $i + 1;
						$next_start = $tag[2] + 1;
					}
				} else {
					// Any other chunks (text between shortcodes, inside HTML) can be ignored also.
					$next_i     = $i + 1;
					$next_start = $chunk_end + 1;
				}
			}
		} else {
			// It's a shortcode tag.
			// Now loop through the plain text chunks to see if any of them look like shortcodes enclosed in shortcodes.
			// We still want to avoid texturizing shortcodes, but we do not intend to run the full regexp recursively.
			for ( $i = $next_i; $i < $results_count; $i++ ) {
				// Test for intersection of shortcode-like tags in $results2 and plain text chunks in $results.
				$chunk_start = $results[ $i ][1];
				$chunk_end   = $results[ $i ][2];

				if ( $chunk_start > $tag[2] ) {
					break;
				}

				$tag_starts_in_chunk = $tag[1] >= $chunk_start && $tag[1] <= $chunk_end;
				$tag_ends_in_chunk   = $tag[2] >= $chunk_start && $tag[2] <= $chunk_end;

				if ( ! $tag_starts_in_chunk || ! $tag_ends_in_chunk ) {
					// The tag is not wholly inside this chunk. Emit the chunk unless the tag was already emitted.
					if ( ! $tag_start_found ) {
						$chunk = $results[ $i ];
						if ( $next_start > $chunk[1] ) {
							// Part of this chunk was already output; keep only the remainder.
							$chunk[1] = $next_start;
							$chunk[3] = $chunk[2] - $chunk[1] + 1;
							$chunk[0] = substr( $text, $chunk[1], $chunk[3] );
						}
						$final[]    = $chunk;
						$next_i     = $i + 1;
						$next_start = $results[ $i ][2] + 1;
					}
					continue;
				}

				$tag_start_found = true;

				// Only emit leading text when some remains unsaved; comparing with
				// $next_start as well avoids a zero-length node between adjacent tags.
				if ( $tag[1] > $chunk_start && $tag[1] > $next_start ) {
					// Truncate the plain text.
					$plain    = array();
					$plain[1] = $next_start;
					$plain[2] = $tag[1] - 1;
					$plain[3] = $plain[2] - $plain[1] + 1;
					$plain[0] = substr( $text, $plain[1], $plain[3] );
					$plain[4] = false;
					$plain[5] = false;
					$final[]  = $plain;
				}
				$final[] = $tag;
				if ( $tag[2] < $chunk_end ) {
					// Need to visit this chunk again on the next loop.
					$next_i     = $i;
					$next_start = $tag[2] + 1;
					break;
				} else {
					$next_i     = $i + 1;
					$next_start = $chunk_end + 1;
				}
			}
		}
	}

	// Now check for plain text and shortcodes after the last HTML tag.
	for ( $i = $next_i; $i < $results_count; $i++ ) {
		$chunk_start = $results[ $i ][1];
		$chunk_end   = $results[ $i ][2];
		if ( $next_start > $chunk_start ) {
			// The start of this chunk was already output; keep only the remainder.
			$plain    = $results[ $i ];
			$plain[1] = $next_start;
			$plain[3] = $plain[2] - $plain[1] + 1;
			$plain[0] = substr( $text, $plain[1], $plain[3] );
			$final[]  = $plain;
		} else {
			$final[] = $results[ $i ];
		}
	}

	return $final;
}
| | 655 | |
| | 656 | /** |