HtmlTableReader Class for VB.Net (Using HTML Agility Pack)

Posted: 09/14/2011

I love the IDataReader interface. To me it’s just an easy, quick way to iterate over data when you know what that data is going to be. The ease of binding it to ASP.NET Webform and Winform grids (plus the many extensions I have for exporting them to Excel and text files) makes DataReaders extremely efficient and useful for me. I’ve created a few recently that I’ll share over a blog post or two. This one takes an HTML table (provided as a string) and turns its contents into a DataReader. The one stipulation is that the table must have the same number of columns in every row (having a colspan attribute on one row will mess it up). I may compensate for that in the future, but I don’t have a need to right now. I did this in about 10 minutes, so it may be a little sloppy, but it has worked for the few tests I’ve thrown at it. This class makes use of the HTML Agility Pack to parse the HTML (it is highly useful, and I highly recommend it if you plan on doing any HTML parsing). The HTML Agility Pack is open source and you can download the code or the library at: http://htmlagilitypack.codeplex.com/

Imports System.Data
Imports System.Net
Imports System.Collections
Imports System.Collections.Specialized
Imports System.Text
Imports System.Text.RegularExpressions
Imports HtmlAgilityPack
Imports Iuf.Extensions
Namespace Argus.Data
    ''' <summary>
    ''' A class that reads an HTML table and exposes it as a DataReader.  This class depends on, and is coded against,
    ''' the HtmlAgilityPack.  The table must have a consistent cell count in each row; tables that use a colspan
    ''' attribute on some rows but not on others are not handled.
    ''' </summary>
    ''' <remarks>
    ''' </remarks>
    Public Class HtmlTableReader
        Implements IDataReader

        '*********************************************************************************************************************
        '
        '             Class:  HtmlTableReader
        '      Initial Date:  09/13/2011
        '      Last Updated:  09/14/2011
        '     Programmer(s):  Blake Pell, bpell@indiana.edu
        '
        '*********************************************************************************************************************        

        ''' <summary>
        ''' Parses the first table found in the supplied HTML into an in-memory DataTable and opens a
        ''' DataTableReader over it.  Every column is created with a data type of String.
        ''' </summary>
        ''' <param name="html">The HTML that contains the table to read.</param>
        ''' <param name="firstRowContainsHeader">
        ''' True to use the text of the first row's cells as the column names (that row is then not returned as data).
        ''' False to name the columns Column1, Column2, ... and return the first row as data.
        ''' </param>
        ''' <exception cref="ArgumentNullException">Thrown when html is Nothing.</exception>
        ''' <remarks>
        ''' If the HTML contains no table, or the table has no rows with cells, the reader is still usable: it
        ''' simply has no columns and no rows.  A data row that has more cells than there are columns is rejected
        ''' by DataTable.Rows.Add; rows with fewer cells leave the trailing columns unset.
        ''' </remarks>
        Public Sub New(ByVal html As String, ByVal firstRowContainsHeader As Boolean)
            If html Is Nothing Then
                Throw New ArgumentNullException("html")
            End If

            Me.Html = html

            Dim dt As New System.Data.DataTable("HtmlTableReader")

            Dim hd As New HtmlDocument()
            hd.LoadHtml(html)

            ' SelectSingleNode / SelectNodes return Nothing (not an empty collection) when nothing matches,
            ' so every XPath result below is checked before it is used.
            Dim table As HtmlNode = hd.DocumentNode.SelectSingleNode("//table")

            If table IsNot Nothing Then
                ' We're only going to parse the first table in the case of multiple tables or nested tables.
                ' The path must be relative (".//tr"); a leading "//" would search the whole document and pull
                ' in the rows of every other table as well.
                Dim rows As HtmlNodeCollection = table.SelectNodes(".//tr")

                If rows IsNot Nothing Then
                    Dim schemaBuilt As Boolean = False

                    For Each tr As HtmlNode In rows
                        ' Rows that live in a table nested inside one of our cells are not ours.
                        If Not BelongsToTable(tr, table) Then
                            Continue For
                        End If

                        ' Header rows are commonly written with <th> instead of <td>, accept both.
                        Dim cells As HtmlNodeCollection = tr.SelectNodes("th|td")

                        If cells Is Nothing Then
                            Continue For
                        End If

                        If Not schemaBuilt Then
                            BuildColumns(dt, cells, firstRowContainsHeader)
                            schemaBuilt = True

                            ' Without a header row the first row is real data and must not be thrown away.
                            If Not firstRowContainsHeader Then
                                dt.Rows.Add(ReadCellText(cells))
                            End If
                        Else
                            dt.Rows.Add(ReadCellText(cells))
                        End If
                    Next
                End If
            End If

            Me.DataTable = dt
            Me.DataReader = dt.CreateDataReader()
        End Sub

        ''' <summary>
        ''' Determines whether a row's nearest enclosing table element is the specified table.
        ''' </summary>
        ''' <param name="row">The tr node to test.</param>
        ''' <param name="table">The table node being parsed.</param>
        ''' <returns>True if the closest table ancestor of the row is the specified table.</returns>
        Private Shared Function BelongsToTable(ByVal row As HtmlNode, ByVal table As HtmlNode) As Boolean
            Dim ancestor As HtmlNode = row.ParentNode

            While ancestor IsNot Nothing
                If String.Equals(ancestor.Name, "table", StringComparison.OrdinalIgnoreCase) Then
                    Return ancestor Is table
                End If

                ancestor = ancestor.ParentNode
            End While

            Return False
        End Function

        ''' <summary>
        ''' Adds one String column to the table for every cell in the supplied row.
        ''' </summary>
        ''' <param name="dt">The table that receives the columns.</param>
        ''' <param name="cells">The cells of the first row of the HTML table.</param>
        ''' <param name="useCellTextAsName">
        ''' True to name each column after its cell text, False to name them Column1, Column2, ...
        ''' </param>
        Private Shared Sub BuildColumns(ByVal dt As System.Data.DataTable, ByVal cells As HtmlNodeCollection, ByVal useCellTextAsName As Boolean)
            Dim position As Integer = 0

            For Each cell As HtmlNode In cells
                position += 1

                Dim columnName As String

                If useCellTextAsName Then
                    columnName = MakeUniqueColumnName(dt.Columns, cell.InnerText)
                Else
                    columnName = "Column" & position.ToString
                End If

                dt.Columns.Add(columnName, GetType(String))
            Next
        End Sub

        ''' <summary>
        ''' Returns a column name that does not collide with a column already in the collection.
        ''' </summary>
        ''' <param name="columns">The columns that have been added so far.</param>
        ''' <param name="proposedName">The name taken from the header cell.</param>
        ''' <returns>
        ''' The proposed name when it is empty or unused, otherwise the proposed name with the first free
        ''' numeric suffix (2, 3, ...) appended.
        ''' </returns>
        ''' <remarks>
        ''' Two header cells with the same text would otherwise make DataColumnCollection.Add throw.  An empty
        ''' name is passed through unchanged so the DataTable can assign its own default name.
        ''' </remarks>
        Private Shared Function MakeUniqueColumnName(ByVal columns As DataColumnCollection, ByVal proposedName As String) As String
            If String.IsNullOrEmpty(proposedName) OrElse Not columns.Contains(proposedName) Then
                Return proposedName
            End If

            Dim suffix As Integer = 2

            While columns.Contains(proposedName & suffix.ToString)
                suffix += 1
            End While

            Return proposedName & suffix.ToString
        End Function

        ''' <summary>
        ''' Copies the inner text of each cell into a string array, in cell order.
        ''' </summary>
        ''' <param name="cells">The cells of a single row.</param>
        ''' <returns>An array with one entry per cell.</returns>
        Private Shared Function ReadCellText(ByVal cells As HtmlNodeCollection) As String()
            Dim fields(cells.Count - 1) As String
            Dim index As Integer = 0

            For Each cell As HtmlNode In cells
                fields(index) = cell.InnerText
                index += 1
            Next

            Return fields
        End Function

        Private _dataReader As DataTableReader
        ''' <summary>
        ''' The underlying DataTableReader that we are wrapping.
        ''' </summary>
        ''' <value></value>
        ''' <returns></returns>
        ''' <remarks>
        ''' Its members are exposed through the IDataReader implementation of this class, so the reader
        ''' itself does not need to be exposed.
        ''' </remarks>
        Private Property DataReader() As DataTableReader
            Get
                Return _dataReader
            End Get
            Set(ByVal value As DataTableReader)
                _dataReader = value
            End Set
        End Property
        Private _dataTable As System.Data.DataTable
        ''' <summary>
        ''' The underlying DataTable that is populated from the parsed HTML.
        ''' </summary>
        ''' <value></value>
        ''' <returns></returns>
        ''' <remarks>
        ''' This has been left private so that the DataTable isn't tampered with while
        ''' iterating over the DataReader.
        ''' </remarks>
        Private Property DataTable() As System.Data.DataTable
            Get
                Return _dataTable
            End Get
            Set(ByVal value As System.Data.DataTable)
                _dataTable = value
            End Set
        End Property
        Private _html As String = ""
        ''' <summary>
        ''' The html that represents the table.
        ''' </summary>
        ''' <value></value>
        ''' <returns></returns>
        ''' <remarks>
        ''' The HTML is parsed once, in the constructor.  Assigning a new value here only replaces the stored
        ''' string; it does not re-parse or change the rows that the reader returns.
        ''' </remarks>
        Public Property Html() As String
            Get
                Return _html
            End Get
            Set(ByVal value As String)
                _html = value
            End Set
        End Property
        ''' <summary>
        ''' Resets the underlying DataReader and creates a new one that is at the first position.
        ''' </summary>
        ''' <remarks></remarks>
        Public Sub MoveFirst()
            If Me.DataReader IsNot Nothing Then
                Me.DataReader.Close()
                Me.DataReader = Nothing
            End If
            If Me.DataTable IsNot Nothing Then
                Me.DataReader = Me.DataTable.CreateDataReader()
            End If
        End Sub
        ''' <summary>
        ''' Closes the DataReader.  Does nothing if there is no underlying reader.
        ''' </summary>
        ''' <remarks></remarks>
        Public Sub Close() Implements System.Data.IDataReader.Close
            If Me.DataReader IsNot Nothing Then
                Me.DataReader.Close()
            End If
        End Sub
        ''' <summary>
        ''' The number of records in the DataReader.
        ''' </summary>
        ''' <returns>The number of rows in the underlying DataTable, or 0 if there is no table.</returns>
        ''' <remarks></remarks>
        Public Function RowCount() As Integer
            If Me.DataTable Is Nothing Then
                Return 0
            End If
            Return Me.DataTable.Rows.Count
        End Function

        ' ---------------------------------------------------------------------------------------------------------------
        ' IDataReader / IDataRecord members.  Each one delegates to the wrapped DataTableReader.  Because every
        ' column is created as a String, GetString, GetValue, GetValues and the Item indexers are the useful
        ' accessors; the other typed getters are passed straight through and are expected to fail with an
        ' InvalidCastException on String columns (behaviour of DataTableReader - confirm against its documentation).
        ' ---------------------------------------------------------------------------------------------------------------

        ''' <summary>
        ''' The depth of nesting for the current row, as reported by the underlying reader.
        ''' </summary>
        Public ReadOnly Property Depth() As Integer Implements System.Data.IDataReader.Depth
            Get
                Return Me.DataReader.Depth
            End Get
        End Property
        ''' <summary>
        ''' Returns the schema table of the underlying reader.
        ''' </summary>
        Public Function GetSchemaTable() As System.Data.DataTable Implements System.Data.IDataReader.GetSchemaTable
            Return Me.DataReader.GetSchemaTable()
        End Function
        ''' <summary>
        ''' Whether the underlying reader has been closed.
        ''' </summary>
        Public ReadOnly Property IsClosed() As Boolean Implements System.Data.IDataReader.IsClosed
            Get
                Return Me.DataReader.IsClosed
            End Get
        End Property
        ''' <summary>
        ''' Advances the underlying reader to the next result set, if any.
        ''' </summary>
        Public Function NextResult() As Boolean Implements System.Data.IDataReader.NextResult
            Return Me.DataReader.NextResult()
        End Function
        ''' <summary>
        ''' Advances the reader to the next row.
        ''' </summary>
        ''' <returns>True if there was another row, otherwise False.</returns>
        Public Function Read() As Boolean Implements System.Data.IDataReader.Read
            Return Me.DataReader.Read()
        End Function
        ''' <summary>
        ''' The records affected value reported by the underlying reader.
        ''' </summary>
        Public ReadOnly Property RecordsAffected() As Integer Implements System.Data.IDataReader.RecordsAffected
            Get
                Return Me.DataReader.RecordsAffected
            End Get
        End Property
        ''' <summary>
        ''' The number of columns in the current row.
        ''' </summary>
        Public ReadOnly Property FieldCount() As Integer Implements System.Data.IDataRecord.FieldCount
            Get
                Return Me.DataReader.FieldCount
            End Get
        End Property
        ''' <summary>
        ''' Gets the value of the specified column as a Boolean.
        ''' </summary>
        Public Function GetBoolean(ByVal i As Integer) As Boolean Implements System.Data.IDataRecord.GetBoolean
            Return Me.DataReader.GetBoolean(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Byte.
        ''' </summary>
        Public Function GetByte(ByVal i As Integer) As Byte Implements System.Data.IDataRecord.GetByte
            Return Me.DataReader.GetByte(i)
        End Function
        ''' <summary>
        ''' Reads a stream of bytes from the specified column into the supplied buffer.
        ''' </summary>
        Public Function GetBytes(ByVal i As Integer, ByVal fieldOffset As Long, ByVal buffer() As Byte, ByVal bufferoffset As Integer, ByVal length As Integer) As Long Implements System.Data.IDataRecord.GetBytes
            Return Me.DataReader.GetBytes(i, fieldOffset, buffer, bufferoffset, length)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Char.
        ''' </summary>
        Public Function GetChar(ByVal i As Integer) As Char Implements System.Data.IDataRecord.GetChar
            Return Me.DataReader.GetChar(i)
        End Function
        ''' <summary>
        ''' Reads a stream of characters from the specified column into the supplied buffer.
        ''' </summary>
        Public Function GetChars(ByVal i As Integer, ByVal fieldoffset As Long, ByVal buffer() As Char, ByVal bufferoffset As Integer, ByVal length As Integer) As Long Implements System.Data.IDataRecord.GetChars
            Return Me.DataReader.GetChars(i, fieldoffset, buffer, bufferoffset, length)
        End Function
        ''' <summary>
        ''' Not supported by this reader.
        ''' </summary>
        ''' <exception cref="NotImplementedException">Always thrown.</exception>
        Public Function GetData(ByVal i As Integer) As System.Data.IDataReader Implements System.Data.IDataRecord.GetData
            Throw New NotImplementedException
        End Function
        ''' <summary>
        ''' Gets the name of the data type of the specified column.
        ''' </summary>
        Public Function GetDataTypeName(ByVal i As Integer) As String Implements System.Data.IDataRecord.GetDataTypeName
            Return Me.DataReader.GetDataTypeName(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Date.
        ''' </summary>
        Public Function GetDateTime(ByVal i As Integer) As Date Implements System.Data.IDataRecord.GetDateTime
            ' Delegates to GetDateTime; this previously returned GetDataTypeName(i), which is a String.
            Return Me.DataReader.GetDateTime(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Decimal.
        ''' </summary>
        Public Function GetDecimal(ByVal i As Integer) As Decimal Implements System.Data.IDataRecord.GetDecimal
            Return Me.DataReader.GetDecimal(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Double.
        ''' </summary>
        Public Function GetDouble(ByVal i As Integer) As Double Implements System.Data.IDataRecord.GetDouble
            Return Me.DataReader.GetDouble(i)
        End Function
        ''' <summary>
        ''' Gets the Type of the specified column.
        ''' </summary>
        Public Function GetFieldType(ByVal i As Integer) As System.Type Implements System.Data.IDataRecord.GetFieldType
            Return Me.DataReader.GetFieldType(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Single.
        ''' </summary>
        Public Function GetFloat(ByVal i As Integer) As Single Implements System.Data.IDataRecord.GetFloat
            Return Me.DataReader.GetFloat(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Guid.
        ''' </summary>
        Public Function GetGuid(ByVal i As Integer) As System.Guid Implements System.Data.IDataRecord.GetGuid
            Return Me.DataReader.GetGuid(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Short.
        ''' </summary>
        Public Function GetInt16(ByVal i As Integer) As Short Implements System.Data.IDataRecord.GetInt16
            Return Me.DataReader.GetInt16(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as an Integer.
        ''' </summary>
        Public Function GetInt32(ByVal i As Integer) As Integer Implements System.Data.IDataRecord.GetInt32
            Return Me.DataReader.GetInt32(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a Long.
        ''' </summary>
        Public Function GetInt64(ByVal i As Integer) As Long Implements System.Data.IDataRecord.GetInt64
            Return Me.DataReader.GetInt64(i)
        End Function
        ''' <summary>
        ''' Gets the name of the column at the specified ordinal.
        ''' </summary>
        Public Function GetName(ByVal i As Integer) As String Implements System.Data.IDataRecord.GetName
            Return Me.DataReader.GetName(i)
        End Function
        ''' <summary>
        ''' Gets the ordinal of the column with the specified name.
        ''' </summary>
        Public Function GetOrdinal(ByVal name As String) As Integer Implements System.Data.IDataRecord.GetOrdinal
            Return Me.DataReader.GetOrdinal(name)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as a String.
        ''' </summary>
        Public Function GetString(ByVal i As Integer) As String Implements System.Data.IDataRecord.GetString
            Return Me.DataReader.GetString(i)
        End Function
        ''' <summary>
        ''' Gets the value of the specified column as an Object.
        ''' </summary>
        Public Function GetValue(ByVal i As Integer) As Object Implements System.Data.IDataRecord.GetValue
            Return Me.DataReader.GetValue(i)
        End Function
        ''' <summary>
        ''' Copies the values of the current row into the supplied array.
        ''' </summary>
        ''' <returns>The number of values copied, as reported by the underlying reader.</returns>
        Public Function GetValues(ByVal values() As Object) As Integer Implements System.Data.IDataRecord.GetValues
            Return Me.DataReader.GetValues(values)
        End Function
        ''' <summary>
        ''' Whether the specified column of the current row is DBNull.
        ''' </summary>
        Public Function IsDBNull(ByVal i As Integer) As Boolean Implements System.Data.IDataRecord.IsDBNull
            Return Me.DataReader.IsDBNull(i)
        End Function
        ''' <summary>
        ''' Gets the value of the column at the specified ordinal.
        ''' </summary>
        Default Public Overloads ReadOnly Property Item(ByVal i As Integer) As Object Implements System.Data.IDataRecord.Item
            Get
                Return Me.DataReader.Item(i)
            End Get
        End Property
        ''' <summary>
        ''' Gets the value of the column with the specified name.
        ''' </summary>
        Default Public Overloads ReadOnly Property Item(ByVal name As String) As Object Implements System.Data.IDataRecord.Item
            Get
                Return Me.DataReader.Item(name)
            End Get
        End Property
        Private disposedValue As Boolean = False        ' To detect redundant calls
        ' IDisposable
        ''' <summary>
        ''' Releases the wrapped reader and table the first time it is called with disposing set to True.
        ''' </summary>
        ''' <param name="disposing">True when called from Dispose().</param>
        Protected Overridable Sub Dispose(ByVal disposing As Boolean)
            If Not Me.disposedValue Then
                If disposing Then
                    ' Hands the reader and table to the shared cleanup helper (defined outside this file).
                    Iuf.Data.DatabaseUtils.CleanupDbResources(Nothing, Me.DataReader, Nothing, Nothing, Me.DataTable)
                    ' TODO: free other state (managed objects).
                End If
                ' TODO: free your own state (unmanaged objects).
                ' TODO: set large fields to null.
            End If
            Me.disposedValue = True
        End Sub
#Region " IDisposable Support "
        ' This code added by Visual Basic to correctly implement the disposable pattern.
        Public Sub Dispose() Implements IDisposable.Dispose
            ' Do not change this code.  Put cleanup code in Dispose(ByVal disposing As Boolean) above.
            Dispose(True)
            GC.SuppressFinalize(Me)
        End Sub
#End Region
    End Class
End Namespace