WordPress.org

Make WordPress Core


Ignore:
Timestamp:
02/10/2012 01:42:15 PM (8 years ago)
Author:
duck_
Message:

Improve efficiency of make_clickable(). Props mdawaffe. Fixes #16892.

Not only does this improve general performance, but also helps to prevent
segfaults caused by malicious input to the regular expression. The regular
expression is also simplified to help readability and maintenance.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/wp-includes/formatting.php

    r19796 r19899  
    13881388function _make_url_clickable_cb($matches) {
    13891389    $url = $matches[2];
    1390     $suffix = '';
    1391 
    1392     /** Include parentheses in the URL only if paired **/
     1390
     1391    if ( ')' == $matches[3] && strpos( $url, '(' ) ) {
     1393        // If the trailing character is a closing parenthesis, and the URL has an opening parenthesis in it, add the closing parenthesis to the URL.
     1393        // Then we can let the parenthesis balancer do its thing below.
     1394        $url .= $matches[3];
     1395        $suffix = '';
     1396    } else {
     1397        $suffix = $matches[3];
     1398    }
     1399
     1400    // Include parentheses in the URL only if paired
    13931401    while ( substr_count( $url, '(' ) < substr_count( $url, ')' ) ) {
    13941402        $suffix = strrchr( $url, ')' ) . $suffix;
     
    14591467 * @return string Content with converted URIs.
    14601468 */
    1461 function make_clickable($ret) {
    1462     $ret = ' ' . $ret;
    1463     // in testing, using arrays here was found to be faster
    1464     $save = @ini_set('pcre.recursion_limit', 10000);
    1465     $retval = preg_replace_callback('#(?<!=[\'"])(?<=[*\')+.,;:!&$\s>])(\()?([\w]+?://(?:[\w\\x80-\\xff\#%~/?@\[\]-]{1,2000}|[\'*(+.,;:!=&$](?![\b\)]|(\))?([\s]|$))|(?(1)\)(?![\s<.,;:]|$)|\)))+)#is', '_make_url_clickable_cb', $ret);
    1466     if (null !== $retval )
    1467         $ret = $retval;
    1468     @ini_set('pcre.recursion_limit', $save);
     1469function make_clickable( $ret ) {
     1470    // Long strings might contain expensive edge cases ...
     1471    if ( 10000 < strlen( $ret ) ) {
     1472        $r = '';
     1473        // ... break it up
     1474        foreach ( _split_str_by_whitespace( $ret, 2100 ) as $chunk ) { // 2100: Extra room for scheme and leading and trailing parentheses
     1475            if ( 2101 < strlen( $chunk ) ) {
     1476                $r .= $chunk; // Too big, no whitespace: bail.
     1477            } else {
     1478                $r .= make_clickable( $chunk );
     1479            }
     1480        }
     1481        return $r;
     1482    }
     1483
     1484    $ret = " $ret "; // Pad with whitespace to simplify the regexes
     1485
     1486    $url_clickable = '~
     1487        ([\\s(<.,;:!?])                                        # 1: Leading whitespace, or punctuation
     1488        (                                                      # 2: URL
     1489            [\\w]{1,20}+://                                # Scheme and hier-part prefix
     1490            (?=\S{1,2000}\s)                               # Limit to URLs less than about 2000 characters long
     1491            [\\w\\x80-\\xff#%\\~/@\\[\\]*(+=&$-]*+         # Non-punctuation URL character
     1492            (?:                                            # Unroll the Loop: Only allow puctuation URL character if followed by a non-punctuation URL character
     1493                [\'.,;:!?)]                            # Punctuation URL character
     1494                [\\w\\x80-\\xff#%\\~/@\\[\\]*(+=&$-]++ # Non-punctuation URL character
     1495            )*
     1496        )
     1497        (\)?)                                                  # 3: Trailing closing parenthesis (for parethesis balancing post processing)
     1498    ~xS'; // The regex is a non-anchored pattern and does not have a single fixed starting character.
     1499          // Tell PCRE to spend more time optimizing since, when used on a page load, it will probably be used several times.
     1500
     1501    $ret = preg_replace_callback( $url_clickable, '_make_url_clickable_cb', $ret );
     1502
    14691503    $ret = preg_replace_callback('#([\s>])((www|ftp)\.[\w\\x80-\\xff\#$%&~/.\-;:=,?@\[\]+]+)#is', '_make_web_ftp_clickable_cb', $ret);
    14701504    $ret = preg_replace_callback('#([\s>])([.0-9a-z_+-]+)@(([0-9a-z-]+\.)+[0-9a-z]{2,})#i', '_make_email_clickable_cb', $ret);
    1471     // this one is not in an array because we need it to run last, for cleanup of accidental links within links
     1505
     1506    // Cleanup of accidental links within links
    14721507    $ret = preg_replace("#(<a( [^>]+?>|>))<a [^>]+?>([^>]+?)</a></a>#i", "$1$3</a>", $ret);
    1473     $ret = trim($ret);
    1474     return $ret;
     1508    return substr( $ret, 1, -1 ); // Remove our whitespace padding.
     1509}
     1510
     1511/**
     1512 * Breaks a string into chunks by splitting at whitespace characters.
     1513 * The length of each returned chunk is as close to the specified length goal as possible,
     1514 * with the caveat that each chunk includes its trailing delimiter.
     1515 * Chunks longer than the goal are guaranteed to not have any inner whitespace.
     1516 *
     1517 * Joining the returned chunks with empty delimiters reconstructs the input string losslessly.
     1518 *
     1519 * Input string must have no null characters (or eventual transformations on output chunks must not care about null characters)
     1520 *
     1521 * <code>
     1522 * _split_str_by_whitespace( "1234 67890 1234 67890a cd 1234   890 123456789 1234567890a    45678   1 3 5 7 90 ", 10 ) ==
     1523 * array (
     1524 *   0 => '1234 67890 ',  // 11 characters: Perfect split
     1525 *   1 => '1234 ',        //  5 characters: '1234 67890a' was too long
     1526 *   2 => '67890a cd ',   // 10 characters: '67890a cd 1234' was too long
     1527 *   3 => '1234   890 ',  // 11 characters: Perfect split
     1528 *   4 => '123456789 ',   // 10 characters: '123456789 1234567890a' was too long
     1529 *   5 => '1234567890a ', // 12 characters: Too long, but no inner whitespace on which to split
     1530 *   6 => '   45678   ',  // 11 characters: Perfect split
     1531 *   7 => '1 3 5 7 90 ',  // 11 characters: End of $string
     1532 * );
     1533 * </code>
     1534 *
     1535 * @param string $string The string to split
     1536 * @param    int $goal   The desired chunk length.
     1537 *
     1538 * @return array Numeric array of chunks.
     1539 */
     1540function _split_str_by_whitespace( $string, $goal ) {
     1541    $chunks = array();
     1542
     1543    $string_nullspace = strtr( $string, "\r\n\t\v\f ", "\000\000\000\000\000\000" );
     1544
     1545    while ( $goal < strlen( $string_nullspace ) ) {
     1546        $pos = strrpos( substr( $string_nullspace, 0, $goal + 1 ), "\000" );
     1547
     1548        if ( false === $pos ) {
     1549            $pos = strpos( $string_nullspace, "\000", $goal + 1 );
     1550            if ( false === $pos ) {
     1551                break;
     1552            }
     1553        }
     1554
     1555        $chunks[] = substr( $string, 0, $pos + 1 );
     1556        $string = substr( $string, $pos + 1 );
     1557        $string_nullspace = substr( $string_nullspace, $pos + 1 );
     1558    }
     1559
     1560    if ( $string ) {
     1561        $chunks[] = $string;
     1562    }
     1563
     1564    return $chunks;
    14751565}
    14761566
Note: See TracChangeset for help on using the changeset viewer.