| | 321 | * Parse any string into separate chunks of plain text, HTML, and shortcodes. |
| | 322 | * |
| | 323 | * The return value is a 2-dimensional array of strings and metadata ordered |
| | 324 | * by input, similar to a split string. Each node has these keys and values: |
| | 325 | * |
| | 326 | * key- type - value |
| | 327 | * ------------------ |
| | 328 | * 0 - string - The text that was split into this node from the input. |
| | 329 | * 1 - int - Start position of input. |
| | 330 | * 2 - int - End position of input. |
| | 331 | * 3 - int - Length of text. Always equal to End - Start + 1. |
| | 332 | * 4 - bool - Shortcode flag. |
| | 333 | * 5 - bool - HTML flag. |
| | 334 | * 6 - Reserved for future use to indicate the node is inline or block. |
| | 335 | * |
| | 336 | * The string in key 0 is plain text when keys 4 and 5 are both false. |
| | 337 | * |
| | 338 | * @since 4.0.1 |
| | 339 | * @param string $text The user input that needs to be texturized. |
| | 340 | * @return array Structured version of $text with its HTML and shortcodes separated. |
| | 341 | */ |
function wptexturize_parse( $text ) {
	// Node layout used for every entry built below:
	//   0 = text, 1 = start offset, 2 = end offset (inclusive), 3 = length,
	//   4 = shortcode flag, 5 = HTML flag.
	// Offsets come from PREG_OFFSET_CAPTURE, strpos() and strlen(), so they are byte offsets into $text.
	$results  = array(); // Stores the full shortcode matches, then gets updated, and ultimately returned.
	$results2 = array(); // Stores the HTML matches and is read-only after that.

	/*
	 * Pass 1: split $text on registered shortcodes.
	 */
	$regex = '/' . get_shortcode_regex() . '/s';

	preg_match_all( $regex, $text, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );

	$pos = 0; // Offset of the first byte of $text that has not been assigned to a node yet.
	foreach ( $matches as $data ) {
		// Collect data for this tag. The opening tag runs from the start of the
		// match up to and including the first "]" found from that point on.
		$tag    = array();
		$tag[1] = $data[0][1];                        // Start position
		$tag[2] = strpos( $text, ']', $tag[1] );      // End position
		$tag[3] = $tag[2] - $tag[1] + 1;              // Length
		$tag[0] = substr( $text, $tag[1], $tag[3] );  // Tag text
		$tag[4] = true;                               // Is it a shortcode?
		$tag[5] = false;                              // Is it HTML?

		// Was there any text before this tag?
		if ( $tag[1] > $pos ) {
			$plain    = array();
			$plain[1] = $pos;
			$plain[2] = $tag[1] - 1;
			$plain[3] = $plain[2] - $plain[1] + 1;
			$plain[0] = substr( $text, $plain[1], $plain[3] );
			$plain[4] = false;
			$plain[5] = false;
			$results[] = $plain;
		}

		$results[] = $tag;

		// Is this an enclosing tag? The full match is longer than the opening tag
		// whenever something (content and/or a closing tag) follows the first "]".
		if ( strlen( $data[0][0] ) > $tag[3] ) {
			$close = array();

			// Capture group 5 holds the content between the opening and closing tags.
			// Compare against the empty string explicitly: empty() would also reject
			// the legitimate content "0" and fold it into the closing tag node.
			if ( isset( $data[5][0] ) && '' !== $data[5][0] ) {
				$plain    = array();
				$plain[0] = $data[5][0];
				$plain[1] = $data[5][1];
				$plain[2] = $plain[1] + strlen( $plain[0] ) - 1;
				$plain[3] = $plain[2] - $plain[1] + 1;
				$plain[4] = false;
				$plain[5] = false;
				$results[] = $plain;

				$close[1] = $plain[2] + 1;
			} else {
				$close[1] = $tag[2] + 1;
			}

			// The closing node covers everything from there to the end of the full match.
			$close[2] = $tag[1] + strlen( $data[0][0] ) - 1;
			$close[3] = $close[2] - $close[1] + 1;
			$close[0] = substr( $text, $close[1], $close[3] );
			$close[4] = true;
			$close[5] = false;
			$results[] = $close;

			$pos = $close[2] + 1;
		} else {
			$pos = $tag[2] + 1;
		}
	}

	// Was there any text after the last tag?
	if ( $pos < strlen( $text ) ) {
		$plain    = array();
		$plain[0] = substr( $text, $pos );
		$plain[1] = $pos;
		$plain[3] = strlen( $plain[0] );
		$plain[2] = $plain[1] + $plain[3] - 1;
		$plain[4] = false;
		$plain[5] = false;
		$results[] = $plain;
	}

	/*
	 * Pass 2: now remove the shortcodes so we can look for the HTML.
	 *
	 * Each shortcode node is replaced by the same number of spaces, so offsets
	 * found in $html line up with offsets in $text. The chunks are only read
	 * here, so iterate by value rather than by reference.
	 */
	$html = array();
	foreach ( $results as $chunk ) {
		// Is this chunk a shortcode tag?
		if ( $chunk[4] ) {
			$html[] = str_repeat( ' ', $chunk[3] );
		} else {
			$html[] = $chunk[0];
		}
	}
	$html = implode( '', $html );

	/*
	 * Pass 3: now look for HTML. If there are any nested shortcodes, avoid them,
	 * but do not allow HTML inside the attributes of nested shortcodes.
	 * As in the Shortcode API, there is no recursion by default.
	 */
	$comment_regex =
		  '!'           // Start of comment, after the <.
		. '(?:'         // Unroll the loop: Consume everything until --> is found.
		.     '-(?!->)' // Dash not followed by end of comment.
		.     '[^\-]*+' // Consume non-dashes.
		. ')*+'         // Loop possessively.
		. '(?:-->)?';   // End of comment. If not found, match all input.

	$shortcode_regex =
		  '\['              // Find start of shortcode.
		. '[\/\[]?'         // Shortcodes may begin with [/ or [[
		. '[^\s\/\[\]<>]'   // No whitespace before name.
		. '[^\[\]<>]*+'     // Shortcodes do not contain other shortcodes. Possessive critical.
		. '\]'              // Find end of shortcode.
		. '\]?';            // Shortcodes may end with ]]

	$regex =
		  '/('                   // Capture HTML.
		.     '<'                // Find start of element.
		.     '(?(?=!--)'        // Is this a comment?
		.         $comment_regex // Find end of comment.
		.     '|'
		.         '[^>]+>'       // Find end of element.
		.     ')'
		. ')|('                  // Capture shortcodes.
		.     $shortcode_regex   // Find shortcodes.
		. ')/s';

	preg_match_all( $regex, $html, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );

	foreach ( $matches as $data ) {
		// Collect data for this tag.
		$tag    = array();
		$tag[0] = $data[0][0];              // Tag text
		$tag[1] = $data[0][1];              // Start position
		$tag[3] = strlen( $tag[0] );        // Length
		$tag[2] = $tag[1] + $tag[3] - 1;    // End position
		$tag[4] = empty( $data[1][0] );     // Is it a shortcode? (Group 1 is the HTML alternative.)
		$tag[5] = !$tag[4];                 // Is it HTML?
		$results2[] = $tag;
	}

	/*
	 * Pass 4: merge the matches from $results2 into $results.
	 *
	 * Look for standalone HTML tags.
	 * Look for HTML tags surrounding shortcodes.
	 * Look for shortcodes enclosed within shortcodes.
	 *
	 * $results is spliced while it is being walked, which is why the loops
	 * below re-evaluate count( $results ) and adjust $i by hand.
	 */
	foreach ( $results2 as $tag ) {
		$startfound = false;
		$endfound   = false;

		// Is it HTML?
		if ( $tag[5] ) {
			// Now loop through the shortcodes to see if any of them are inside an HTML element.
			// Texturization has been a left-to-right process, and when inside HTML,
			// the $no_texturize_tags list gets tested but the $no_texturize_shortcodes list does not.
			for ( $i = 0; $i < count( $results ); $i++ ) {
				// Test for intersection of HTML in $results2 and shortcode chunks in $results.
				$chunk_start = $results[$i][1];
				$chunk_end   = $results[$i][2];

				// Chunks are ordered by position, so nothing further can overlap this tag.
				if ( $chunk_start > $tag[2] ) {
					break;
				}

				$html_starts_in_chunk = $tag[1] >= $chunk_start && $tag[1] <= $chunk_end;
				$html_ends_in_chunk   = $tag[2] >= $chunk_start && $tag[2] <= $chunk_end;
				$chunk_starts_in_html = $chunk_start >= $tag[1] && $chunk_start <= $tag[2];
				$chunk_ends_in_html   = $chunk_end >= $tag[1] && $chunk_end <= $tag[2];

				if ( !$html_starts_in_chunk && !$html_ends_in_chunk && !$chunk_starts_in_html && !$chunk_ends_in_html ) {
					continue;
				}

				// Up here, check for plain text items for start of HTML.

				// Is the chunk a shortcode?
				if ( $results[$i][4] ) {

					// Our regexp never looks for HTML inside of shortcodes,
					// so the above tests are adequate to determine this chunk
					// is a shortcode contained within an HTML element.

					// Delete the shortcode node. It's getting merged into the HTML node.
					array_splice( $results, $i--, 1 );

				// Deal with the plain text chunk(s) that needs to be marked as HTML.
				} elseif ( $html_starts_in_chunk ) {
					if ( $tag[2] < $chunk_end ) {
						// We're going to need at least one extra node, so figure that out first.
						// It holds the plain text left over after the end of the HTML tag.
						$plain    = array();
						$plain[1] = $tag[2] + 1;
						$plain[2] = $chunk_end;
						$plain[3] = $plain[2] - $plain[1] + 1;
						$plain[0] = substr( $results[$i][0], -$plain[3] );
						$plain[4] = false;
						$plain[5] = false;
						array_splice( $results, $i + 1, 0, array( $plain ) );
					}
					if ( $tag[1] > $chunk_start ) {
						// Truncate the plain text.
						$results[$i][2] = $tag[1] - 1;
						$results[$i][3] = $results[$i][2] - $results[$i][1] + 1;
						$results[$i][0] = substr( $results[$i][0], 0, $results[$i][3] );

						// Insert the HTML node after. Its text is re-read from $text because
						// the matched text came from $html, where shortcodes are blanked out.
						$tag[0] = substr( $text, $tag[1], $tag[3] );
						array_splice( $results, ++$i, 0, array( $tag ) );
					} else {
						// The tag and the chunk could be identical, but just overwrite the chunk for simplicity.
						$tag[0] = substr( $text, $tag[1], $tag[3] );
						$results[$i] = $tag;
					}
				} elseif ( $html_ends_in_chunk ) {
					if ( $tag[2] < $chunk_end ) {
						// Truncate the plain text.
						$results[$i][1] = $tag[2] + 1;
						$results[$i][3] = $results[$i][2] - $results[$i][1] + 1;
						$results[$i][0] = substr( $results[$i][0], -$results[$i][3] );
					} else {
						// This chunk just duplicates the end of the tag. Remove it now.
						array_splice( $results, $i--, 1 );
					}
				} else {
					// Any other chunks (text between shortcodes, inside HTML) can be removed now.
					array_splice( $results, $i--, 1 );
				}
			}
		} else {
			// It's a shortcode tag.
			// Now loop through the plain text chunks to see if any of them look like shortcodes enclosed in shortcodes.
			// We still want to avoid texturizing shortcodes, but we do not intend to run the full regexp recursively.
			for ( $i = 0; $i < count( $results ); $i++ ) {
				// Test for intersection of shortcode-like tags in $results2 and plain text chunks in $results.
				$chunk_start = $results[$i][1];
				$chunk_end   = $results[$i][2];

				// Chunks are ordered by position, so nothing further can contain this tag.
				if ( $chunk_start > $tag[2] ) {
					break;
				}

				$tag_starts_in_chunk = $tag[1] >= $chunk_start && $tag[1] <= $chunk_end;
				$tag_ends_in_chunk   = $tag[2] >= $chunk_start && $tag[2] <= $chunk_end;

				// Only act when the tag lies entirely inside this one chunk.
				if ( !$tag_starts_in_chunk || !$tag_ends_in_chunk ) {
					continue;
				}

				if ( $tag[2] < $chunk_end ) {
					// We're going to need at least one extra node, so figure that out first.
					// It holds the plain text left over after the end of the shortcode-like tag.
					$plain    = array();
					$plain[1] = $tag[2] + 1;
					$plain[2] = $chunk_end;
					$plain[3] = $plain[2] - $plain[1] + 1;
					$plain[0] = substr( $results[$i][0], -$plain[3] );
					$plain[4] = false;
					$plain[5] = false;
					array_splice( $results, $i + 1, 0, array( $plain ) );
				}
				if ( $tag[1] > $chunk_start ) {
					// Truncate the plain text.
					$results[$i][2] = $tag[1] - 1;
					$results[$i][3] = $results[$i][2] - $results[$i][1] + 1;
					$results[$i][0] = substr( $results[$i][0], 0, $results[$i][3] );

					// Insert the shortcode node after.
					$tag[0] = substr( $text, $tag[1], $tag[3] );
					array_splice( $results, ++$i, 0, array( $tag ) );
				} else {
					// The tag and the chunk could be identical, but just overwrite the chunk for simplicity.
					$tag[0] = substr( $text, $tag[1], $tag[3] );
					$results[$i] = $tag;
				}
			}
		}
	}

	return $results;
}
| | 613 | |
| | 614 | /** |