HTML sanitization for ASP.Net

I spent a great deal of time trying to find a reasonable way to clean HTML. I wanted to remove script tags, broken HTML, and so on, but still allow rich text editing. Most sanitization routines are just too strict. My input comes from a CKEditor instance, so the users have a fair number of formatting options that I want to retain.

The code was assembled from several sources, converted to VB.NET, and tweaked a little by me.

To use it, call:

HtmlUtility.SanitizeHtml("your messy HTML")

This strips any tags that are not in the whitelist and then removes unmatched opening or closing tags so that the remaining markup is balanced.

It requires the HTML Agility Pack - http://nuget.org/packages/HtmlAgilityPack


Public NotInheritable Class HtmlUtility

    ''' <summary>
    ''' Removes ALL HTML markup from a string and returns the document's inner text.
    ''' </summary>
    ''' <param name="source">HTML source; may be Nothing or empty.</param>
    ''' <returns>
    ''' The inner text of the parsed document, or an empty string when
    ''' <paramref name="source"/> is Nothing or empty.
    ''' </returns>
    Public Shared Function StripHTML(source As String) As String
        If String.IsNullOrEmpty(source) Then Return String.Empty

        Dim doc = New HtmlAgilityPack.HtmlDocument()
        ' The markup must be loaded before reading InnerText; an unloaded
        ' document is empty, so the function would always return "".
        doc.LoadHtml(source)
        Return doc.DocumentNode.InnerText
    End Function

    ''' <summary>
    ''' Cleans raw HTML in two passes: the input is first run through
    ''' <see cref="Sanitize"/> and the result is then run through
    ''' <see cref="BalanceTags"/>.
    ''' </summary>
    ''' <param name="source">Html source</param>
    ''' <returns>
    ''' Clean output; an empty string when <paramref name="source"/> is
    ''' Nothing or empty.
    ''' </returns>
    Public Shared Function SanitizeHtml(source As String) As String
        ' Nothing to clean: short-circuit before touching the parser.
        If String.IsNullOrEmpty(source) Then
            Return String.Empty
        End If

        Return BalanceTags(Sanitize(source))
    End Function

    ' Matches an opening or closing tag (terminated by whitespace, end of input
    ' or ">") and captures the tag name in the "tagname" group.
    Private Shared ReadOnly _namedtags As New Regex("</?(?<tagname>\w+)[^>]*(\s|$|>)", RegexOptions.Singleline Or RegexOptions.ExplicitCapture Or RegexOptions.Compiled)

    ''' <summary>
    ''' http://refactormycode.com/codes/360-balance-html-tags
    ''' Attempts to balance the HTML tags in the string by removing any
    ''' unmatched opening or closing tags.
    ''' IMPORTANT: we *assume* HTML has *already* been
    ''' sanitized and is safe/sane before balancing!
    ''' CODESNIPPET: A8591DBA-D1D3-11DE-947C-BA5556D89593
    ''' </summary>
    ''' <param name="html">HTML fragment to balance; may be Nothing or empty.</param>
    ''' <returns>
    ''' The input with every unpaired tag removed. Tags named in the ignore
    ''' list (p, img, br, li, hr, input) are never removed. Nothing or empty
    ''' input is returned unchanged.
    ''' </returns>
    Public Shared Function BalanceTags(html As String) As String
        If [String].IsNullOrEmpty(html) Then
            Return html
        End If

        ' Match against a lower-cased copy so tag comparisons are case
        ' insensitive; the match positions are applied to the original string.
        Dim tags As MatchCollection = _namedtags.Matches(html.ToLowerInvariant())

        ' no HTML tags present? nothing to do; exit now
        Dim tagcount As Integer = tags.Count
        If tagcount = 0 Then
            Return html
        End If

        Const ignoredtags As String = "<p><img><br><li><hr><input>"
        Dim tagpaired(tagcount - 1) As Boolean
        Dim tagremove(tagcount - 1) As Boolean

        ' loop through matched tags in forward order
        For ctag As Integer = 0 To tagcount - 1
            Dim tagname As String = tags(ctag).Groups("tagname").Value

            ' skip any already paired tags
            ' and skip tags in our ignore list; assume they're self-closed
            If tagpaired(ctag) OrElse ignoredtags.Contains("<" & tagname & ">") Then
                Continue For
            End If

            Dim tag As String = tags(ctag).Value
            Dim match As Integer = -1

            If tag.StartsWith("</", StringComparison.Ordinal) Then
                ' this is a closing tag
                ' search backwards (previous tags), look for opening tags
                For ptag As Integer = ctag - 1 To 0 Step -1
                    Dim prevtag As String = tags(ptag).Value
                    ' the start tag must be <tag> or <tag{space} to match
                    If Not tagpaired(ptag) AndAlso
                       (prevtag.StartsWith("<" & tagname & ">", StringComparison.Ordinal) OrElse
                        prevtag.StartsWith("<" & tagname & " ", StringComparison.Ordinal)) Then
                        match = ptag
                        Exit For
                    End If
                Next
            Else
                ' this is an opening tag
                ' search forwards (next tags), look for closing tags
                For ntag As Integer = ctag + 1 To tagcount - 1
                    If Not tagpaired(ntag) AndAlso tags(ntag).Value.Equals("</" & tagname & ">", StringComparison.Ordinal) Then
                        match = ntag
                        Exit For
                    End If
                Next
            End If

            ' we tried, regardless, if we got this far
            tagpaired(ctag) = True
            If match = -1 Then
                ' no partner found: mark for removal
                tagremove(ctag) = True
            Else
                ' mark the partner as paired so it is not considered again
                tagpaired(match) = True
            End If
        Next

        ' loop through tags again, this time in reverse order
        ' so we can safely delete all orphaned tags from the string
        ' (removing from the end keeps the earlier match indexes valid)
        For ctag As Integer = tagcount - 1 To 0 Step -1
            If tagremove(ctag) Then
                html = html.Remove(tags(ctag).Index, tags(ctag).Length)
                System.Diagnostics.Debug.WriteLine("unbalanced tag removed: " & tags(ctag).ToString)
            End If
        Next

        Return html
    End Function

    ''' <summary>
    ''' Tags that survive sanitization, keyed by lower-case tag name; each value
    ''' lists the attribute names that are kept on that tag.
    ''' </summary>
    ''' <remarks>
    ''' NOTE(review): the list is deliberately permissive (it allows form, iframe
    ''' and input elements and the style attribute on most tags) - confirm that
    ''' this is acceptable for the content being sanitized.
    ''' </remarks>
    Private Shared ReadOnly Whitelist As New Dictionary(Of String, String())() From { _
    {"p", New String() {"style", "class", "align"}}, _
    {"head", New String() {"style", "class", "align"}}, _
    {"body", New String() {"style", "class", "align"}}, _
    {"pre", New String() {"style", "class", "align"}}, _
    {"div", New String() {"style", "class", "align"}}, _
    {"span", New String() {"style", "class"}}, _
    {"br", New String() {"style", "class"}}, _
    {"hr", New String() {"style", "class"}}, _
    {"label", New String() {"style", "class"}}, _
    {"h1", New String() {"style", "class"}}, _
    {"h2", New String() {"style", "class"}}, _
    {"h3", New String() {"style", "class"}}, _
    {"h4", New String() {"style", "class"}}, _
    {"h5", New String() {"style", "class"}}, _
    {"h6", New String() {"style", "class"}}, _
    {"font", New String() {"style", "class", "color", "face", "size"}}, _
    {"strong", New String() {"style", "class"}}, _
    {"b", New String() {"style", "class"}}, _
    {"em", New String() {"style", "class"}}, _
    {"i", New String() {"style", "class"}}, _
    {"u", New String() {"style", "class"}}, _
    {"strike", New String() {"style", "class"}}, _
    {"ol", New String() {"style", "class"}}, _
    {"ul", New String() {"style", "class"}}, _
    {"li", New String() {"style", "class"}}, _
    {"blockquote", New String() {"style", "class"}}, _
    {"code", New String() {"style", "class"}}, _
    {"a", New String() {"style", "class", "href", "title", "target", "name"}}, _
    {"img", New String() {"style", "class", "src", "height", "width", "alt", "title", "hspace", "vspace", "border"}}, _
    {"table", New String() {"style", "class", "width", "cellpadding", "cellspacing", "align", "border"}}, _
    {"thead", New String() {"style", "class"}}, _
    {"tbody", New String() {"style", "class"}}, _
    {"tfoot", New String() {"style", "class"}}, _
    {"th", New String() {"style", "class", "scope"}}, _
    {"tr", New String() {"style", "class"}}, _
    {"td", New String() {"style", "class", "colspan"}}, _
    {"q", New String() {"style", "class", "cite"}}, _
    {"cite", New String() {"style", "class"}}, _
    {"abbr", New String() {"style", "class"}}, _
    {"acronym", New String() {"style", "class"}}, _
    {"del", New String() {"style", "class"}}, _
    {"ins", New String() {"style", "class"}}, _
    {"form", New String() {"style", "class", "method", "name", "action"}}, _
    {"iframe", New String() {"style", "class", "frameborder", "height", "width", "src", "allowfullscreen"}}, _
    {"input", New String() {"name", "type", "value", "class"}} _
    }

    ''' <summary>
    ''' Strip tags not in whitelist
    ''' http://stackoverflow.com/questions/3107514/html-agility-pack-strip-tags-not-in-whitelist
    ''' </summary>
    ''' <param name="input">Raw HTML; may be Nothing or empty.</param>
    ''' <returns>
    ''' The trimmed markup written back out after the document tree has been
    ''' passed through <see cref="SanitizeNode"/>; an empty string when
    ''' <paramref name="input"/> is Nothing or empty.
    ''' </returns>
    ''' <remarks>Tags are not balanced here; see <see cref="BalanceTags"/>.</remarks>
    Public Shared Function Sanitize(input As String) As String
        If String.IsNullOrEmpty(input) Then Return String.Empty

        Dim htmlDocument = New HtmlDocument()
        ' Parse the input and clean the tree in place; without these two steps
        ' the document is empty and the function would always return "".
        htmlDocument.LoadHtml(input)
        SanitizeNode(htmlDocument.DocumentNode)

        Return htmlDocument.DocumentNode.WriteTo().Trim()
    End Function

    ''' <summary>
    ''' Runs <see cref="SanitizeNode"/> on every child of the given node.
    ''' </summary>
    ''' <param name="parentNode">Node whose children are sanitized.</param>
    Private Shared Sub SanitizeChildren(parentNode As HtmlNode)
        ' Walk backwards: SanitizeNode may remove the child it is given, and
        ' iterating from the end keeps the remaining indexes valid.
        For i As Integer = parentNode.ChildNodes.Count - 1 To 0 Step -1
            SanitizeNode(parentNode.ChildNodes(i))
        Next
    End Sub

    ''' <summary>
    ''' Sanitizes a single node and then its descendants. An element whose name
    ''' is not a key of <see cref="Whitelist"/> is removed from its parent
    ''' together with its content. On a whitelisted element, any attribute that
    ''' is not listed for that tag, or whose value contains "javascript"
    ''' (compared case-insensitively), is removed.
    ''' </summary>
    ''' <param name="node">Node to sanitize in place.</param>
    ''' <remarks>
    ''' NOTE(review): the "javascript" test is a plain substring check on the
    ''' attribute value; entity-encoded or otherwise obfuscated script URLs are
    ''' presumably not caught - verify whether stricter URL validation is needed.
    ''' </remarks>
    Private Shared Sub SanitizeNode(node As HtmlNode)
        If node.NodeType = HtmlNodeType.Element Then
            Dim allowedAttributes As String() = Nothing
            If Not Whitelist.TryGetValue(node.Name, allowedAttributes) Then
                ' Disallowed element: drop it (and everything inside it) and stop;
                ' there is nothing left to descend into.
                If node.ParentNode IsNot Nothing Then
                    node.ParentNode.RemoveChild(node)
                End If
                Return
            End If

            If node.HasAttributes Then
                ' Iterate backwards so removing an attribute does not shift the
                ' indexes still to be visited.
                For i As Integer = node.Attributes.Count - 1 To 0 Step -1
                    Dim currentAttribute As HtmlAttribute = node.Attributes(i)
                    Dim attributeValue As String = currentAttribute.Value
                    Dim looksLikeScript As Boolean = attributeValue IsNot Nothing AndAlso
                        attributeValue.IndexOf("javascript", StringComparison.OrdinalIgnoreCase) >= 0

                    ' A single combined test so an attribute is never removed twice.
                    If looksLikeScript OrElse Not allowedAttributes.Contains(currentAttribute.Name) Then
                        node.Attributes.Remove(currentAttribute)
                    End If
                Next
            End If
        End If

        If node.HasChildNodes Then
            SanitizeChildren(node)
        End If
    End Sub
End Class

