WordPress.org

Make WordPress Core

Changeset 19899


Ignore:
Timestamp:
02/10/12 13:42:15 (2 years ago)
Author:
duck_
Message:

Improve efficiency of make_clickable(). Props mdawaffe. Fixes #16892.

Not only does this improve general performance, but also helps to prevent
segfaults caused by malicious input to the regular expression. The regular
expression is also simplified to help readability and maintenance.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/wp-includes/formatting.php

    r19796 r19899  
    13881388function _make_url_clickable_cb($matches) { 
    13891389    $url = $matches[2]; 
    1390     $suffix = ''; 
    1391  
    1392     /** Include parentheses in the URL only if paired **/ 
     1390 
     1391    if ( ')' == $matches[3] && strpos( $url, '(' ) ) { 
     1392        // If the trailing character is a closing parenthesis, and the URL has an opening parenthesis in it, add the closing parenthesis to the URL. 
     1393        // Then we can let the parenthesis balancer do its thing below. 
     1394        $url .= $matches[3]; 
     1395        $suffix = ''; 
     1396    } else { 
     1397        $suffix = $matches[3]; 
     1398    } 
     1399 
     1400    // Include parentheses in the URL only if paired 
    13931401    while ( substr_count( $url, '(' ) < substr_count( $url, ')' ) ) { 
    13941402        $suffix = strrchr( $url, ')' ) . $suffix; 
     
    14591467 * @return string Content with converted URIs. 
    14601468 */ 
    1461 function make_clickable($ret) { 
    1462     $ret = ' ' . $ret; 
    1463     // in testing, using arrays here was found to be faster 
    1464     $save = @ini_set('pcre.recursion_limit', 10000); 
    1465     $retval = preg_replace_callback('#(?<!=[\'"])(?<=[*\')+.,;:!&$\s>])(\()?([\w]+?://(?:[\w\\x80-\\xff\#%~/?@\[\]-]{1,2000}|[\'*(+.,;:!=&$](?![\b\)]|(\))?([\s]|$))|(?(1)\)(?![\s<.,;:]|$)|\)))+)#is', '_make_url_clickable_cb', $ret); 
    1466     if (null !== $retval ) 
    1467         $ret = $retval; 
    1468     @ini_set('pcre.recursion_limit', $save); 
     1469function make_clickable( $ret ) { 
     1470    // Long strings might contain expensive edge cases ... 
     1471    if ( 10000 < strlen( $ret ) ) { 
     1472        $r = ''; 
     1473        // ... break it up 
     1474        foreach ( _split_str_by_whitespace( $ret, 2100 ) as $chunk ) { // 2100: Extra room for scheme and leading and trailing parentheses 
     1475            if ( 2101 < strlen( $chunk ) ) { 
     1476                $r .= $chunk; // Too big, no whitespace: bail. 
     1477            } else { 
     1478                $r .= make_clickable( $chunk ); 
     1479            } 
     1480        } 
     1481        return $r; 
     1482    } 
     1483 
     1484    $ret = " $ret "; // Pad with whitespace to simplify the regexes 
     1485 
     1486    $url_clickable = '~ 
     1487        ([\\s(<.,;:!?])                                        # 1: Leading whitespace, or punctuation 
     1488        (                                                      # 2: URL 
     1489            [\\w]{1,20}+://                                # Scheme and hier-part prefix 
     1490            (?=\S{1,2000}\s)                               # Limit to URLs less than about 2000 characters long 
     1491            [\\w\\x80-\\xff#%\\~/@\\[\\]*(+=&$-]*+         # Non-punctuation URL character 
     1492            (?:                                            # Unroll the Loop: Only allow punctuation URL character if followed by a non-punctuation URL character 
     1493                [\'.,;:!?)]                            # Punctuation URL character 
     1494                [\\w\\x80-\\xff#%\\~/@\\[\\]*(+=&$-]++ # Non-punctuation URL character 
     1495            )* 
     1496        ) 
     1497        (\)?)                                                  # 3: Trailing closing parenthesis (for parenthesis balancing post processing) 
     1498    ~xS'; // The regex is a non-anchored pattern and does not have a single fixed starting character. 
     1499          // Tell PCRE to spend more time optimizing since, when used on a page load, it will probably be used several times. 
     1500 
     1501    $ret = preg_replace_callback( $url_clickable, '_make_url_clickable_cb', $ret ); 
     1502 
    14691503    $ret = preg_replace_callback('#([\s>])((www|ftp)\.[\w\\x80-\\xff\#$%&~/.\-;:=,?@\[\]+]+)#is', '_make_web_ftp_clickable_cb', $ret); 
    14701504    $ret = preg_replace_callback('#([\s>])([.0-9a-z_+-]+)@(([0-9a-z-]+\.)+[0-9a-z]{2,})#i', '_make_email_clickable_cb', $ret); 
    1471     // this one is not in an array because we need it to run last, for cleanup of accidental links within links 
     1505 
     1506    // Cleanup of accidental links within links 
    14721507    $ret = preg_replace("#(<a( [^>]+?>|>))<a [^>]+?>([^>]+?)</a></a>#i", "$1$3</a>", $ret); 
    1473     $ret = trim($ret); 
    1474     return $ret; 
     1508    return substr( $ret, 1, -1 ); // Remove our whitespace padding. 
     1509} 
     1510 
     1511/** 
     1512 * Breaks a string into chunks by splitting at whitespace characters. 
     1513 * The length of each returned chunk is as close to the specified length goal as possible, 
     1514 * with the caveat that each chunk includes its trailing delimiter. 
     1515 * Chunks longer than the goal are guaranteed to not have any inner whitespace. 
     1516 * 
     1517 * Joining the returned chunks with empty delimiters reconstructs the input string losslessly. 
     1518 * 
     1519 * Input string must have no null characters (or eventual transformations on output chunks must not care about null characters) 
     1520 * 
     1521 * <code> 
     1522 * _split_str_by_whitespace( "1234 67890 1234 67890a cd 1234   890 123456789 1234567890a    45678   1 3 5 7 90 ", 10 ) == 
     1523 * array ( 
     1524 *   0 => '1234 67890 ',  // 11 characters: Perfect split 
     1525 *   1 => '1234 ',        //  5 characters: '1234 67890a' was too long 
     1526 *   2 => '67890a cd ',   // 10 characters: '67890a cd 1234' was too long 
     1527 *   3 => '1234   890 ',  // 11 characters: Perfect split 
     1528 *   4 => '123456789 ',   // 10 characters: '123456789 1234567890a' was too long 
     1529 *   5 => '1234567890a ', // 12 characters: Too long, but no inner whitespace on which to split 
     1530 *   6 => '   45678   ',  // 11 characters: Perfect split 
     1531 *   7 => '1 3 5 7 90 ',  // 11 characters: End of $string 
     1532 * ); 
     1533 * </code> 
     1534 * 
     1535 * @param string $string The string to split 
     1536 * @param    int $goal   The desired chunk length. 
     1537 * 
     1538 * @return array Numeric array of chunks. 
     1539 */ 
     1540function _split_str_by_whitespace( $string, $goal ) { 
     1541    $chunks = array(); 
     1542 
     1543    $string_nullspace = strtr( $string, "\r\n\t\v\f ", "\000\000\000\000\000\000" ); 
     1544 
     1545    while ( $goal < strlen( $string_nullspace ) ) { 
     1546        $pos = strrpos( substr( $string_nullspace, 0, $goal + 1 ), "\000" ); 
     1547 
     1548        if ( false === $pos ) { 
     1549            $pos = strpos( $string_nullspace, "\000", $goal + 1 ); 
     1550            if ( false === $pos ) { 
     1551                break; 
     1552            } 
     1553        } 
     1554 
     1555        $chunks[] = substr( $string, 0, $pos + 1 ); 
     1556        $string = substr( $string, $pos + 1 ); 
     1557        $string_nullspace = substr( $string_nullspace, $pos + 1 ); 
     1558    } 
     1559 
     1560    if ( $string ) { 
     1561        $chunks[] = $string; 
     1562    } 
     1563 
     1564    return $chunks; 
    14751565} 
    14761566 
Note: See TracChangeset for help on using the changeset viewer.