Stripping Tags from HTML in C#

It's a common requirement to strip tags from HTML documents, whether you are trying to extract plain text from HTML or to remove tags that users have entered in text input or textarea fields on forms.

The following HTML utility class demonstrates how to strip the tags from HTML documents in order to extract the plain text.

You can download this utility for stripping HTML tags in C# at the end of this post.

The code demonstrates some of the features of our free LINQ to HTML library. It shows how powerful the LINQ to HTML library is at manipulating HTML. The code includes logic to ignore script and style tags; if you wanted, you could add additional code to insert carriage returns for p and br tags.

The main method you would call to strip HTML tags is GetPlainText.

/// <summary>
/// Helpers for extracting plain text from HTML markup using the LINQ to HTML
/// object model (<c>HDocument</c>, <c>HElement</c>, <c>HText</c>, <c>HEntity</c>).
/// </summary>
public static class HtmlUtility
{
   /// <summary>
   /// Strips the tags from an HTML string and returns the remaining text.
   /// </summary>
   /// <param name="html">The HTML markup to convert.</param>
   /// <returns>
   /// The text content of <paramref name="html"/> with the contents of
   /// <c>script</c> and <c>style</c> elements omitted, HTML entities decoded,
   /// runs of whitespace collapsed to a single space and no leading or
   /// trailing whitespace. Returns an empty string when
   /// <paramref name="html"/> is null or empty.
   /// </returns>
   public static string GetPlainText(string html)
   {
      // Nothing to parse: return the same result the clean-up step produces
      // for blank text instead of handing null/empty input to the parser.
      if (string.IsNullOrEmpty(html))
      {
         return string.Empty;
      }

      HDocument document = HDocument.Parse(html);

      StringBuilder builder = new StringBuilder();

      GetPlainText(builder, document.Nodes());

      // Decode after the whole text has been collected, then normalise whitespace.
      return CleanUpWhiteSpace(HttpUtility.HtmlDecode(builder.ToString()));
   }

   /// <summary>
   /// Collapses every run of whitespace characters into a single space and
   /// removes leading and trailing whitespace.
   /// </summary>
   /// <param name="value">The text to normalise.</param>
   /// <returns>The normalised text.</returns>
   private static string CleanUpWhiteSpace(string value)
   {
      StringBuilder builder = new StringBuilder();

      // Set when whitespace has been seen after some output; the separating
      // space is only written once another non-whitespace character follows,
      // so neither leading nor trailing whitespace reaches the result.
      bool pendingSpace = false;

      foreach (char c in value)
      {
         if (Char.IsWhiteSpace(c))
         {
            pendingSpace = builder.Length > 0;
         }
         else
         {
            if (pendingSpace)
            {
               builder.Append(' ');

               pendingSpace = false;
            }

            builder.Append(c);
         }
      }

      return builder.ToString();
   }

   /// <summary>
   /// Recursively appends the text found in <paramref name="nodes"/> to
   /// <paramref name="builder"/>.
   /// </summary>
   /// <param name="builder">The builder that receives the extracted text.</param>
   /// <param name="nodes">The nodes to walk, in document order.</param>
   private static void GetPlainText(StringBuilder builder, IEnumerable<HNode> nodes)
   {
      foreach (HNode node in nodes)
      {
         HElement element = node as HElement;

         if (element != null)
         {
            // Children of script and style elements are skipped entirely.
            // NOTE(review): the comparison uses lowercase literals; whether the
            // parser normalises tag-name casing is not visible here - confirm.
            if (element.Name != "script" && element.Name != "style")
            {
               GetPlainText(builder, element.Nodes());
            }

            continue;
         }

         HText text = node as HText;

         if (text != null)
         {
            // Appended through Append(object), i.e. text.ToString().
            // NOTE(review): presumably this yields the node's raw text - confirm.
            builder.Append(text);

            continue;
         }

         HEntity entity = node as HEntity;

         if (entity != null)
         {
            builder.Append(entity.Value);
         }

         // Any other node type contributes nothing to the output.
      }
   }
}