Truncating Text and HTML

Categories: blog



code_montage.png

In every application I have built, truncating text that contains HTML has been crucial. The last thing I need is an entry that takes up half of the page and makes it look absolutely ugly (I could put a "read more" link on my posts, but sometimes I just forget, and my posts are not that long).

Here's a function that I use for truncating text with HTML (text without HTML also works). If I remember correctly, it's a snippet from CakePHP.

Update: Seems the Text Editor doesn't play nice. Here's the file.

 

PHP:
  /**
   * Truncates a string to a maximum length, optionally keeping HTML well-formed.
   *
   * Lengths are measured in bytes (strlen/substr), so multi-byte characters
   * count once per byte. The length of $ending is included in $length.
   *
   * @param string $text         Text to shorten; may contain HTML when $considerHtml is true.
   * @param int    $length       Maximum length of the visible text, including $ending.
   * @param string $ending       Marker appended to truncated text.
   * @param bool   $exact        If false, the text is cut back to the last space so that
   *                             no word is split; if no usable space exists the text is
   *                             left cut mid-word.
   * @param bool   $considerHtml If true, tags are not counted, HTML entities count as one
   *                             character, and tags left open by the cut are closed.
   *
   * @return string The original text when it already fits, otherwise the truncated text
   *                followed by $ending and, in HTML mode, the missing closing tags.
   */
  function truncate($text, $length = 100, $ending = '...', $exact = false, $considerHtml = true) {
      // one HTML entity (named, decimal or hexadecimal); each is counted as a single character
      $entity_pattern = '/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i';

      if ($considerHtml) {
          // if the plain text is shorter than the maximum length, return the whole text
          if (strlen(preg_replace('/<.*?>/', '', $text)) <= $length) {
              return $text;
          }

          // split the input into (optional tag, following text) pairs
          preg_match_all('/(<.+?>)?([^<>]*)/s', $text, $lines, PREG_SET_ORDER);

          $total_length = strlen($ending);
          $truncate = '';

          foreach ($lines as $line_matchings) {
              // tags are copied to the output without being counted
              if (!empty($line_matchings[1])) {
                  $truncate .= $line_matchings[1];
              }

              // length of the plain text part; entities are handled as one character
              $content_length = strlen(preg_replace($entity_pattern, ' ', $line_matchings[2]));
              if ($total_length + $content_length > $length) {
                  // number of characters which are left; never negative, because a
                  // negative substr() length would trim from the end of the string
                  $left = max(0, $length - $total_length);
                  $entities_length = 0;
                  if (preg_match_all($entity_pattern, $line_matchings[2], $entities, PREG_OFFSET_CAPTURE)) {
                      // widen the cut by the real length of every entity that starts inside it
                      foreach ($entities[0] as $entity) {
                          if ($entity[1] + 1 - $entities_length <= $left) {
                              $left--;
                              $entities_length += strlen($entity[0]);
                          } else {
                              // no more characters left
                              break;
                          }
                      }
                  }
                  $truncate .= substr($line_matchings[2], 0, $left + $entities_length);
                  // maximum length is reached, so leave the loop
                  break;
              }

              $truncate .= $line_matchings[2];
              $total_length += $content_length;

              // if the maximum length is reached, leave the loop
              if ($total_length >= $length) {
                  break;
              }
          }
      } else {
          if (strlen($text) <= $length) {
              return $text;
          }
          $truncate = substr($text, 0, max(0, $length - strlen($ending)));
      }

      // if the words shouldn't be cut in the middle...
      if (!$exact) {
          // ...search the last occurrence of a space...
          $spacepos = strrpos($truncate, ' ');
          if ($considerHtml) {
              // ...skipping spaces that sit inside a tag (e.g. between a tag name and
              // its attributes), because cutting there would leave a broken tag
              while ($spacepos !== false) {
                  $before = substr($truncate, 0, $spacepos);
                  $lt = strrpos($before, '<');
                  $gt = strrpos($before, '>');
                  if ($lt === false || ($gt !== false && $gt > $lt)) {
                      // the nearest '<' to the left is already closed: plain text
                      break;
                  }
                  // inside a tag: continue searching in front of that tag
                  $spacepos = strrpos(substr($truncate, 0, $lt), ' ');
              }
          }
          // strrpos() returns false (not null) when nothing is found
          if ($spacepos !== false) {
              // ...and cut the text in this position
              $truncate = substr($truncate, 0, $spacepos);
          }
      }

      // Work out which tags are still open in the text that is actually kept.
      // This is done after the word-boundary cut so that tags removed by the
      // cut are neither closed needlessly nor left unclosed.
      $open_tags = array();
      if ($considerHtml) {
          preg_match_all('/<.+?>/s', $truncate, $tags);
          foreach ($tags[0] as $tag) {
              // an "empty element" with or without xhtml-conform closing slash (f.e. <br/>)
              if (preg_match('/^<(\s*.+?\/\s*|\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(\s.+?)?)>$/is', $tag)) {
                  continue;
              }
              if (preg_match('/^<\s*\/([^\s]+?)\s*>$/s', $tag, $tag_matchings)) {
                  // closing tag (f.e. </strong>): remove it from the open list; names
                  // are lowercased on both sides so </B> closes <b>
                  $pos = array_search(strtolower($tag_matchings[1]), $open_tags, true);
                  if ($pos !== false) {
                      unset($open_tags[$pos]);
                  }
              } elseif (preg_match('/^<\s*([^\s>!]+).*?>$/s', $tag, $tag_matchings)) {
                  // opening tag (f.e. <strong>): the most recently opened tag goes
                  // first so that tags are closed innermost-first
                  array_unshift($open_tags, strtolower($tag_matchings[1]));
              }
          }
      }

      // add the defined ending to the text
      $truncate .= $ending;

      // close all unclosed html-tags
      foreach ($open_tags as $tag) {
          $truncate .= '</' . $tag . '>';
      }

      return $truncate;
  }

3 Comments »

RSS feed for comments on this post. TrackBack URI

  1. It would be monumentally helpful if the less-than signs and other parts of the code weren’t converted to &lt; etc.

    Comment by Mike — April 28, 2008 #

  2. Sorry about that dude, I’ve posted the file.

    Comment by mkhairul — April 29, 2008 #

  3. I do not believe this

    Comment by fornetti — August 31, 2008 #

Leave a comment

XHTML: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>

Powered by WordPress with GimpStyle Theme design by Horacio Bella.
Entries and comments feeds. Valid XHTML and CSS.