| 321 | * Parse any string into separate chunks of plain text, HTML, and shortcodes. |
| 322 | * |
| 323 | * The return value is a 2-dimensional array of strings and metadata ordered |
| 324 | * by input, similar to a split string. Each node has these keys and values: |
| 325 | * |
| 326 | * key- type - value |
| 327 | * ------------------ |
| 328 | * 0 - string - The text that was split into this node from the input. |
| 329 | * 1 - int - Start position of input. |
| 330 | * 2 - int - End position of input. |
| 331 | * 3 - int - Length of text. Always equal to End - Start + 1. |
| 332 | * 4 - bool - Shortcode flag. |
| 333 | * 5 - bool - HTML flag. |
| 334 | * 6 - Reserved for future use to indicate the node is inline or block. |
| 335 | * |
| 336 | * The string in key 0 is plain text when keys 4 and 5 are both false. |
| 337 | * |
| 338 | * @since 4.0.1 |
| 339 | * @param string $text The user input that needs to be texturized. |
| 340 | * @return array Structured version of $text with its HTML and shortcodes separated. |
| 341 | */ |
function wptexturize_parse( $text ) {
	// Overview of the three passes performed below:
	// 1. Split $text into shortcode and plain text nodes using the Shortcode API regex.
	// 2. Blank out those shortcodes, then scan the remainder for HTML elements and
	//    for shortcode-like tags that were nested inside enclosing shortcodes.
	// 3. Merge the matches from pass 2 into the node list from pass 1.
	//
	// Every node is an array with the keys 0-5 described in the function docblock.
	// All positions and lengths come from strlen()/substr()/preg offsets, i.e. they
	// are byte offsets into $text.

	$results  = array(); // Stores the full shortcode matches, then gets updated, and ultimately returned.
	$results2 = array(); // Stores the HTML matches and is read-only after that.

	// Find shortcodes
	$regex = '/' . get_shortcode_regex() . '/s';

	preg_match_all( $regex, $text, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );

	$pos = 0; // Offset of the first byte of $text that has not been assigned to a node yet.
	foreach ( $matches as $data ) {
		// Collect data for this tag.
		// $data[0] is the whole match (opening tag, content and closing tag for enclosing shortcodes).
		// The opening tag is taken to end at the first "]" found at or after the match start.
		$tag    = array();
		$tag[1] = $data[0][1]; // Start position
		$tag[2] = strpos( $text, ']', $tag[1] ); // End position
		$tag[3] = $tag[2] - $tag[1] + 1; // Length
		if ( strlen( $data[0][0] ) == $tag[3] + 1 && ! empty( $data[6][0] ) ) {
			// This is an escaped, non-enclosing shortcode.
			// Capture group 6 holds the extra trailing "]", so extend the tag by one byte.
			$tag[2]++;
			$tag[3]++;
		}
		$tag[0] = substr( $text, $tag[1], $tag[3] ); // Tag text
		$tag[4] = true; // Is it a shortcode?
		$tag[5] = false; // Is it HTML?

		// Was there any text before this tag?
		if ( $tag[1] > $pos ) {
			$plain     = array();
			$plain[1]  = $pos;
			$plain[2]  = $tag[1] - 1;
			$plain[3]  = $plain[2] - $plain[1] + 1;
			$plain[0]  = substr( $text, $plain[1], $plain[3] );
			$plain[4]  = false;
			$plain[5]  = false;
			$results[] = $plain;
		}

		$results[] = $tag;

		// Is this an enclosing tag?
		// The full match is longer than the opening tag only when content and/or a closing tag follow it.
		if ( strlen( $data[0][0] ) > $tag[3] ) {
			$close = array();

			// Capture group 5 is used as the enclosed content. Compare against the empty
			// string explicitly: empty() would also reject the content "0" and merge it
			// into the closing tag node. isset() covers a group missing from the match set.
			if ( isset( $data[5][0] ) && '' !== $data[5][0] ) {
				$plain     = array();
				$plain[0]  = $data[5][0];
				$plain[1]  = $data[5][1];
				$plain[2]  = $plain[1] + strlen( $plain[0] ) - 1;
				$plain[3]  = $plain[2] - $plain[1] + 1;
				$plain[4]  = false;
				$plain[5]  = false;
				$results[] = $plain;

				$close[1] = $plain[2] + 1;
			} else {
				$close[1] = $tag[2] + 1;
			}

			// The closing tag runs from the end of the content to the end of the whole match.
			$close[2]  = $tag[1] + strlen( $data[0][0] ) - 1;
			$close[3]  = $close[2] - $close[1] + 1;
			$close[0]  = substr( $text, $close[1], $close[3] );
			$close[4]  = true;
			$close[5]  = false;
			$results[] = $close;

			$pos = $close[2] + 1;
		} else {
			$pos = $tag[2] + 1;
		}
	}

	// Was there any text after the last tag?
	if ( $pos < strlen( $text ) ) {
		$plain     = array();
		$plain[0]  = substr( $text, $pos );
		$plain[1]  = $pos;
		$plain[3]  = strlen( $plain[0] );
		$plain[2]  = $plain[1] + $plain[3] - 1;
		$plain[4]  = false;
		$plain[5]  = false;
		$results[] = $plain;
	}

	// Now remove the shortcodes so we can look for the HTML.
	// Each shortcode node is replaced by the same number of spaces, so offsets found
	// in $html line up with offsets in $text.
	// Iterate by value: nothing is modified here, and a by-reference loop variable would
	// stay bound to the last element of $results while that array is spliced and returned.
	$html = array();
	foreach ( $results as $chunk ) {
		// Is this chunk a shortcode tag?
		if ( $chunk[4] ) {
			$html[] = str_repeat( ' ', $chunk[3] );
		} else {
			$html[] = $chunk[0];
		}
	}
	$html = implode( '', $html );

	// Now look for HTML. If there are any nested shortcodes, avoid them,
	// but do not allow HTML inside the attributes of nested shortcodes.
	// As in the Shortcode API, there is no recursion by default.

	$comment_regex =
		  '!'           // Start of comment, after the <.
		. '(?:'         // Unroll the loop: Consume everything until --> is found.
		.     '-(?!->)' // Dash not followed by end of comment.
		.     '[^\-]*+' // Consume non-dashes.
		. ')*+'         // Loop possessively.
		. '(?:-->)?';   // End of comment. If not found, match all input.

	$shortcode_regex =
		  '\['              // Find start of shortcode.
		. '[\/\[]?'         // Shortcodes may begin with [/ or [[
		. '[^\s\/\[\]<>]'   // No whitespace before name.
		. '[^\[\]<>]*+'     // Shortcodes do not contain other shortcodes. Possessive critical.
		. '\]'              // Find end of shortcode.
		. '\]?';            // Shortcodes may end with ]]

	$regex =
		  '/('                   // Capture HTML.
		.     '<'                // Find start of element.
		.     '(?(?=!--)'        // Is this a comment?
		.         $comment_regex // Find end of comment.
		.     '|'
		.         '[^>]+>'       // Find end of element.
		.     ')'
		. ')|('                  // Capture shortcodes.
		.     $shortcode_regex   // Find shortcodes.
		. ')/s';

	preg_match_all( $regex, $html, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );

	foreach ( $matches as $data ) {
		// Collect data for this tag.
		// Capture group 1 is the HTML alternative; when it is empty the match came
		// from the shortcode alternative instead.
		$tag        = array();
		$tag[0]     = $data[0][0]; // Tag text
		$tag[1]     = $data[0][1]; // Start position
		$tag[3]     = strlen( $tag[0] ); // Length
		$tag[2]     = $tag[1] + $tag[3] - 1; // End position
		$tag[4]     = empty( $data[1][0] ); // Is it a shortcode?
		$tag[5]     = ! $tag[4]; // Is it HTML?
		$results2[] = $tag;
	}

	// Look for standalone HTML tags.
	// Look for HTML tags surrounding shortcodes.
	// Look for shortcodes enclosed within shortcodes.

	foreach ( $results2 as $tag ) {
		// Is it HTML?
		if ( $tag[5] ) {
			// Now loop through the shortcodes to see if any of them are inside an HTML element.
			// Texturization has been a left-to-right process, and when inside HTML,
			// the $no_texturize_tags list gets tested but the $no_texturize_shortcodes list does not.
			// count() is re-evaluated on every pass because $results grows and shrinks inside the loop.
			for ( $i = 0; $i < count( $results ); $i++ ) {
				// Test for intersection of HTML in $results2 and shortcode chunks in $results.
				$chunk_start = $results[ $i ][1];
				$chunk_end   = $results[ $i ][2];

				// Nodes are ordered by position, so nothing further right can overlap this tag.
				if ( $chunk_start > $tag[2] ) {
					break;
				}

				$html_starts_in_chunk = $tag[1] >= $chunk_start && $tag[1] <= $chunk_end;
				$html_ends_in_chunk   = $tag[2] >= $chunk_start && $tag[2] <= $chunk_end;
				$chunk_starts_in_html = $chunk_start >= $tag[1] && $chunk_start <= $tag[2];
				$chunk_ends_in_html   = $chunk_end >= $tag[1] && $chunk_end <= $tag[2];

				if ( ! $html_starts_in_chunk && ! $html_ends_in_chunk && ! $chunk_starts_in_html && ! $chunk_ends_in_html ) {
					continue;
				}

				// Is the chunk a shortcode?
				if ( $results[ $i ][4] ) {

					// Our regexp never looks for HTML inside of shortcodes,
					// so the above tests are adequate to determine this chunk
					// is a shortcode contained within an HTML element.

					// Delete the shortcode node. It's getting merged into the HTML node.
					// The post-decrement makes the loop's $i++ land on the node that slid into this slot.
					array_splice( $results, $i--, 1 );

				// Deal with the plain text chunk(s) that needs to be marked as HTML.
				} elseif ( $html_starts_in_chunk ) {
					if ( $tag[2] < $chunk_end ) {
						// We're going to need at least one extra node, so figure that out first.
						// It holds the plain text left over after the end of the HTML tag.
						$plain    = array();
						$plain[1] = $tag[2] + 1;
						$plain[2] = $chunk_end;
						$plain[3] = $plain[2] - $plain[1] + 1;
						$plain[0] = substr( $results[ $i ][0], -$plain[3] );
						$plain[4] = false;
						$plain[5] = false;
						array_splice( $results, $i + 1, 0, array( $plain ) );
					}
					if ( $tag[1] > $chunk_start ) {
						// Truncate the plain text.
						$results[ $i ][2] = $tag[1] - 1;
						$results[ $i ][3] = $results[ $i ][2] - $results[ $i ][1] + 1;
						$results[ $i ][0] = substr( $results[ $i ][0], 0, $results[ $i ][3] );

						// Insert the HTML node after.
						// Re-read the text from $text because $html has spaces where shortcodes were.
						$tag[0] = substr( $text, $tag[1], $tag[3] );
						array_splice( $results, ++$i, 0, array( $tag ) );
					} else {
						// The tag and the chunk could be identical, but just overwrite the chunk for simplicity.
						$tag[0]        = substr( $text, $tag[1], $tag[3] );
						$results[ $i ] = $tag;
					}
				} elseif ( $html_ends_in_chunk ) {
					if ( $tag[2] < $chunk_end ) {
						// Truncate the plain text.
						// Only the part after the end of the HTML tag is kept.
						$results[ $i ][1] = $tag[2] + 1;
						$results[ $i ][3] = $results[ $i ][2] - $results[ $i ][1] + 1;
						$results[ $i ][0] = substr( $results[ $i ][0], -$results[ $i ][3] );
					} else {
						// This chunk just duplicates the end of the tag. Remove it now.
						array_splice( $results, $i--, 1 );
					}
				} else {
					// Any other chunks (text between shortcodes, inside HTML) can be removed now.
					array_splice( $results, $i--, 1 );
				}
			}
		} else {
			// It's a shortcode tag.
			// Now loop through the plain text chunks to see if any of them look like shortcodes enclosed in shortcodes.
			// We still want to avoid texturizing shortcodes, but we do not intend to run the full regexp recursively.
			for ( $i = 0; $i < count( $results ); $i++ ) {
				// Test for intersection of shortcode-like tags in $results2 and plain text chunks in $results.
				$chunk_start = $results[ $i ][1];
				$chunk_end   = $results[ $i ][2];

				// Nodes are ordered by position, so nothing further right can contain this tag.
				if ( $chunk_start > $tag[2] ) {
					break;
				}

				$tag_starts_in_chunk = $tag[1] >= $chunk_start && $tag[1] <= $chunk_end;
				$tag_ends_in_chunk   = $tag[2] >= $chunk_start && $tag[2] <= $chunk_end;

				// Only act on a chunk that contains the whole tag.
				if ( ! $tag_starts_in_chunk || ! $tag_ends_in_chunk ) {
					continue;
				}

				if ( $tag[2] < $chunk_end ) {
					// We're going to need at least one extra node, so figure that out first.
					// It holds the plain text left over after the end of the shortcode tag.
					$plain    = array();
					$plain[1] = $tag[2] + 1;
					$plain[2] = $chunk_end;
					$plain[3] = $plain[2] - $plain[1] + 1;
					$plain[0] = substr( $results[ $i ][0], -$plain[3] );
					$plain[4] = false;
					$plain[5] = false;
					array_splice( $results, $i + 1, 0, array( $plain ) );
				}
				if ( $tag[1] > $chunk_start ) {
					// Truncate the plain text.
					$results[ $i ][2] = $tag[1] - 1;
					$results[ $i ][3] = $results[ $i ][2] - $results[ $i ][1] + 1;
					$results[ $i ][0] = substr( $results[ $i ][0], 0, $results[ $i ][3] );

					// Insert the shortcode node after.
					$tag[0] = substr( $text, $tag[1], $tag[3] );
					array_splice( $results, ++$i, 0, array( $tag ) );
				} else {
					// The tag and the chunk could be identical, but just overwrite the chunk for simplicity.
					$tag[0]        = substr( $text, $tag[1], $tag[3] );
					$results[ $i ] = $tag;
				}
			}
		}
	}
	return $results;
}
| 616 | |
| 617 | /** |