' Counts how often a US spelling and a UK spelling of a word occur in a PDF,
' using Acrobat's automation (IAC) objects and the document's JavaScript
' object (JSObject) to walk every word on every page.
'
' Results:
'   nCount  - number of words equal to sustext (case-insensitive)
'   nCount1 - number of words equal to suktext (case-insensitive)
'   iUSCnt / iUKCnt - incremented at most once per run, on the first hit of
'       the respective spelling. They are not initialised here; presumably
'       they are running totals owned by the surrounding code - TODO confirm.

' Full path of the PDF to scan.
filey = "*your full file name including directory here*"

' Words to look for.
sustext = "accessorizes"
suktext = "accessorises"

' Counters and first-hit flags.
nCount = 0
nCount1 = 0
gbStop = False   ' not read in this snippet; kept in case outer code uses it
bUSCnt = False
bUKCnt = False

' Decide once whether each search term is usable instead of re-checking it
' for every word in the document.
If Trim(sustext) = "" Then sustext = ""
If Trim(suktext) = "" Then suktext = ""

AcroExchApp = CreateObject("AcroExch.App")
AcroExchAVDoc = CreateObject("AcroExch.AVDoc")

Try
    ' AVDoc.Open reports failure through its return value; skip the scan
    ' (counts stay 0) instead of failing later on a missing PDDoc.
    If CBool(AcroExchAVDoc.Open(filey, "")) Then
        Try
            ' Get the PDDoc associated with the open AVDoc.
            AcroExchPDDoc = AcroExchAVDoc.GetPDDoc

            ' The JavaScript object is tied to the PDDoc of the PDF.
            jso = AcroExchPDDoc.GetJSObject

            If Not jso Is Nothing Then
                ' Total number of pages.
                nPages = jso.numpages

                For i = 0 To nPages - 1
                    ' Check each word on the page.
                    nWords = jso.getPageNumWords(i)

                    For j = 0 To nWords - 1
                        word = Trim(CStr(jso.getPageNthWord(i, j)))

                        If word <> "" Then
                            ' US spelling.
                            If sustext <> "" Then
                                If StrComp(word, sustext, vbTextCompare) = 0 Then
                                    nCount = nCount + 1
                                    If bUSCnt = False Then
                                        iUSCnt = iUSCnt + 1
                                        bUSCnt = True
                                    End If
                                End If
                            End If

                            ' UK spelling.
                            If suktext <> "" Then
                                If StrComp(word, suktext, vbTextCompare) = 0 Then
                                    nCount1 = nCount1 + 1
                                    If bUKCnt = False Then
                                        iUKCnt = iUKCnt + 1
                                        bUKCnt = True
                                    End If
                                End If
                            End If
                        End If
                    Next j
                Next i
            End If
        Finally
            jso = Nothing
            ' Close without saving so the document is released even when the
            ' scan throws part-way through.
            AcroExchAVDoc.Close(True)
        End Try
    End If
Finally
    ' Shut down the Acrobat instance started above so it does not linger.
    AcroExchApp.Exit()
End Try
For i = 0 To nPages - 1 ' check each word in a page nWords = jso.getPageNumWords(i) For j = 0 To nWords - 1 ' get a word word = Trim(CStr(jso.getPageNthWord(i,j)))
- Open the .pdf file
- Read the text of each page
- Use a regular expression to find matches
- Save the matches to a list of strings, eliminating duplicates
- For each string in this list, search the page and highlight the word
' Highlights every occurrence of codes shaped like "ABC-123" (three upper-case
' letters or digits, a hyphen, three more) on each page of a PDF and saves the
' result to a new file.
'
' Three upper-case alphanumerics, a hyphen, three more. The original class
' "[A-Z,0-9]" also accepted a literal comma, because a comma inside [...] is
' an ordinary character rather than a separator.
Dim pattern As String = "([A-Z0-9]{3}-[A-Z0-9]{3})"

' Using guarantees the document is disposed even if a page fails to process.
Using pdf As New PdfDocument("Path")
    For Each page As PdfPageBase In pdf.Pages
        ' Work on this page's text only. Accumulating text across pages made
        ' every page re-scan (and re-search for) all earlier pages' matches.
        Dim pageText As String = page.ExtractText()
        If String.IsNullOrEmpty(pageText) Then Continue For

        ' Tracks strings already highlighted on this page, so each distinct
        ' match is searched for only once.
        Dim seen As New HashSet(Of String)

        For Each match As Match In Regex.Matches(pageText, pattern, RegexOptions.None)
            If seen.Add(match.Value) Then
                ' Find all occurrences of this string on the page and
                ' highlight them.
                For Each find As PdfTextFind In page.FindText(match.Value).Finds
                    find.ApplyHighLight(Color.BlueViolet) ' set your colour preference here
                Next
            End If
        Next
    Next

    pdf.SaveToFile("New Path")
    pdf.Close()
End Using