Option Compare Database
Option Explicit
Const PageUrl = "https://barnamenevis.org/forumdisplay.php?23-Access/page"
Private Document As HTMLDocument
Private PagesCount As Integer
Private RS_threads As Recordset
Private temp_object, temp_array As Variant
Private StopProcess As Boolean
Sub Scrape_Forum_Pages()
'DoCmd.RunSQL "DELETE * FROM Threads"
Init
Dim PageNumber As Integer: PageNumber = 1
PagesCount = 1
StopProcess = False
Set RS_threads = CurrentDb.OpenRecordset("Threads", dbOpenDynaset)
Do While PageNumber <= PagesCount And (Not StopProcess)
Scrape_Forum_Page (PageNumber)
PageNumber = PageNumber + 1
Loop
RS_threads.Close
Set RS_threads = Nothing
Set HTML = Nothing
Scrape_New_Or_Updated_Threads
MsgBox ("Access Forum Scraping Done!")
End Sub
Sub Scrape_Forum_Page(PageNumber As Integer)
Set Document = GetDocument(PageUrl + Trim(PageNumber))
If PageNumber = 1 Then
GetForumPagesCount
ScrapePageThreads ("stickies") ' sticky threads
End If
ScrapePageThreads ("threads") ' normal threads
Set Document = Nothing
Debug.Print "Page " & PageNumber & " done! " & Time
End Sub
Sub ScrapePageThreads(ThreadsID As String)
Dim Threads As HTMLListElement
Dim Thread As HTMLListElement
Set Threads = Document.getElementById(ThreadsID)
StopProcess = False
For Each Thread In Threads.Children
If StopProcess Then
Exit For
Else
GetThreadInfo Thread
End If
Next
Set Threads = Nothing
Set Thread = Nothing
End Sub
Sub GetThreadInfo(ByRef Thread As HTMLLIElement)
Dim ThreadID, Title, Author, CreateTime, UpdateTime, Tags As String
Dim Replies, Attachments As Integer
Dim img As HTMLImg
Dim span As HTMLSpanElement
temp_object = Trim(Thread.querySelector(".threadstats > li").innerText)
If temp_object = vbNullString Then Exit Sub ' Moved Thread
temp_array = Split(temp_object, blank)
Replies = CInt(temp_array(1))
temp_array = Split(Thread.Id, "_")
ThreadID = temp_array(1)
Title = Document.getElementById("thread_title_" & ThreadID).innerText
Set span = Thread.querySelector(".author .label")
CreateTime = GetTime(span.childNodes.Item(2).Data)
Author = span.querySelector("a").innerText
Set img = NzImg(Thread.querySelector(".author .threaddetails img[src*='tag']"))
If img Is Nothing Then
Tags = vbNullString
Else
Tags = img.Title
End If
Set img = NzImg(Thread.querySelector(".author .threaddetails img[src*='paperclip']"))
If img Is Nothing Then
Attachments = 0
Else
temp_array = Split(img.Title, blank)
Attachments = CInt(temp_array(0))
End If
UpdateTime = GetTime(Thread.querySelector(".threadlastpost>dd:l ast-of-type").innerText)
With RS_threads
.FindFirst "ThreadID=" & ThreadID
If .NoMatch Then ' New Thread
.AddNew
!ThreadID = CLng(ThreadID)
!Title = Title
!Tags = Tags
!Author = Author
!CreateTime = CreateTime
!UpdateTime = UpdateTime
!Replies = Replies
!Attachments = Attachments
!UpdateRequired = True
.Update
Else ' Old Thread
If !UpdateTime = UpdateTime Then ' Unchanged Old Thread
StopProcess = True
Else ' Old Thread - Updated or New Post
.Edit
!Title = Title
!Tags = Tags
!UpdateTime = UpdateTime
!Replies = Replies
!Attachments = Attachments
!UpdateRequired = True
.Update
End If
End If
End With
Set img = Nothing
Set span = Nothing
End Sub
Sub GetForumPagesCount()
Dim a As HTMLAnchorElement
Set a = Document.querySelector(".threadpagenav span.first_last > a")
If a Is Nothing Then
PagesCount = 1
Else
temp_array = Split(a.href, "/page")
PagesCount = CInt(temp_array(1))
End If
Set a = Nothing
End Sub
Function NzImg(ByRef v As Variant) As HTMLImg
If IsNull(v) Then
Set NzImg = Nothing
Else
Set NzImg = v
End If
End Function