WordIndex.vb
  1. ''
  2. '' This code is part of Document Solutions for PDF demos.
  3. '' Copyright (c) MESCIUS inc. All rights reserved.
  4. ''
  5. Imports System.IO
  6. Imports System.Drawing
  7. Imports GrapeCity.Documents.Pdf
  8. Imports GrapeCity.Documents.Pdf.TextMap
  9. Imports GrapeCity.Documents.Text
  10. Imports GrapeCity.Documents.Common
  11. Imports GrapeCity.Documents.Pdf.Annotations
  12.  
  13. '' This sample loads an existing PDF and, using a predefined list of key words,
  14. '' builds an alphabetical index of those words linked to pages where they occur
  15. '' in the document. The generated index pages are appended to the original document,
  16. '' and saved in a new PDF.
  17. '' The index is rendered in two balanced columns, using the technique
  18. '' demonstrated in the BalancedColumns sample.
  19. ''
  20. '' NOTE: if you download this sample and run it locally on your own system
  21. '' without a valid DsPdf license, only the first five pages of the sample PDF
  22. '' will be loaded, and the index will be generated for those five pages only.
  Public Class WordIndex

      '' Font collection to hold the fonts we need:
      Private _fc As FontCollection = New FontCollection()
      '' Font family used throughout this sample (this is not case-sensitive):
      Const _fontFamily As String = "segoe ui"

      '' Main sample entry.
      '' Loads the source PDF, appends a keyword index to it, and saves the result
      '' to the passed stream.
      ''   stream - the output stream that receives the resulting PDF.
      '' Returns the total number of pages in the saved document
      '' (original pages plus the appended index pages).
      Function CreatePDF(ByVal stream As Stream) As Integer
          '' Set up a font collection with the fonts we need:
          _fc.RegisterDirectory(Path.Combine("Resources", "Fonts"))

          '' Get the PDF to add index to:
          Dim tfile = Path.Combine("Resources", "PDFs", "CompleteJavaScriptBook.pdf")

          '' The list of words on which we will build the index
          '' (case-insensitive duplicates and empty entries are dropped):
          Dim words = _keywords.Distinct(StringComparer.InvariantCultureIgnoreCase).Where(Function(w_) Not String.IsNullOrEmpty(w_))

          '' Load the PDF and add the index. The source file stream is kept open
          '' until the document has been saved:
          Using fs = New FileStream(tfile, FileMode.Open, FileAccess.Read)
              Dim doc = New GcPdfDocument()
              doc.Load(fs)
              '' Remember the page count before the index is appended; this is also
              '' the zero-based index of the first index page:
              Dim origPageCount = doc.Pages.Count
              '' Build and add the index:
              AddWordIndex(doc, words)
              '' Open document on the first index page by default
              '' (may not work in browser viewers, but works in Acrobat):
              doc.OpenAction = New DestinationFit(origPageCount)
              '' Done:
              doc.Save(stream)
              Return doc.Pages.Count
          End Using
      End Function

      '' The list of words to build the index on:
      Private ReadOnly _keywords() As String =
      {
          "JavaScript", "Framework", "MVC", "npm", "URL", "CDN", "HTML5", "CSS", "ES2015", "web",
          "Node.js", "API", "model", "view", "controller", "data management", "UI", "HTML",
          "API", "function", "var", "component", "design pattern", "React.js", "Angular", "AJAX",
          "DOM", "TypeScript", "ECMAScript", "CLI", "Wijmo", "CoffeeScript", "Elm",
          "plugin", "VueJS", "Knockout", "event", "AngularJS", "pure JS", "data binding", "OOP", "GrapeCity",
          "gauge", "JSX", "mobile", "desktop", "Vue", "template", "server-side", "client-side",
          "SPEC", "RAM", "ECMA"
      }

      '' Finds the pages on which the text described by 'tp' occurs.
      '' Calling FindText() on a document or a page builds text maps for each page on the fly.
      '' Reusing cached text maps speeds things up a lot.
      ''   maps - the pre-built text maps, one per page to search.
      ''   tp   - the search parameters.
      '' Returns the sorted set of indices (map.Page.Index) of the pages with at least one hit.
      Private Function FindTextPages(ByVal maps As ITextMap(), ByVal tp As FindTextParams) As SortedSet(Of Integer)
          Dim finds = New SortedSet(Of Integer)
          Dim currPageIdx = -1
          For Each map In maps
              '' The callback below records the page currently being searched;
              '' the set ignores repeated hits on the same page:
              currPageIdx = map.Page.Index
              map.FindText(tp, Function(fp_) finds.Add(currPageIdx))
          Next
          Return finds
      End Function

      '' Adds a word index to the end of the passed document.
      ''   doc   - the document to search and to append the index pages to.
      ''   words - the key words to index; null/empty and repeated entries are skipped.
      '' At least one page is always appended, even if none of the words were found.
      Private Sub AddWordIndex(ByVal doc As GcPdfDocument, ByVal words As IEnumerable(Of String))
          Dim tStart = Util.TimeNow()

          '' Build text maps for all pages to speed up FindText() calls
          Dim textMaps(doc.Pages.Count - 1) As ITextMap
          For i = 0 To doc.Pages.Count - 1
              textMaps(i) = doc.Pages(i).GetTextMap()
          Next

          '' Words and page indices where they occur, sorted on words.
          '' The comparer matches the one used to de-duplicate the key words,
          '' so the sort order does not depend on the current culture:
          Dim index = New SortedDictionary(Of String, List(Of Integer))(StringComparer.InvariantCultureIgnoreCase)

          '' Here the main loop building the index is on key words.
          '' An alternative would be to loop over the pages.
          '' Depending on the relative sizes of the keyword dictionary vs
          '' the number of pages in the document, one or the other might be better,
          '' but this is beyond the scope of this sample.
          For Each word In words
              '' Skip entries that cannot be indexed or were already processed:
              If String.IsNullOrEmpty(word) OrElse index.ContainsKey(word) Then
                  Continue For
              End If
              '' Multi-word phrases are searched as plain text, single words as whole words:
              Dim wholeWord As Boolean = word.IndexOf(" "c) = -1
              Dim pgs = FindTextPages(textMaps, New FindTextParams(word, wholeWord, False))
              '' A very simplistic way of also finding plurals:
              If wholeWord AndAlso Not word.EndsWith("s", StringComparison.Ordinal) Then
                  pgs.UnionWith(FindTextPages(textMaps, New FindTextParams(word + "s", wholeWord, False)))
              End If
              '' Only words that actually occur in the document get an index entry:
              If pgs.Any() Then
                  index.Add(word, pgs.ToList())
              End If
          Next

          '' Prepare to render the index. The whole index is built
          '' in a single TextLayout instance, set up to render it
          '' in two columns per page.
          '' The main rendering loop uses the TextLayout.SplitAndBalance method
          '' using the approach demonstrated in BalancedColumns sample.
          '' The complication here is that we need to associate a link to the
          '' relevant page with each page number rendered, see linkIndices below.
          '' Set up the TextLayout:
          Const margin As Single = 72.0F
          Dim pageWidth = doc.PageSize.Width
          Dim pageHeight = doc.PageSize.Height
          '' Width of the content area between the left and right margins:
          Dim cW = pageWidth - margin * 2
          '' Caption (index letter) format:
          Dim tfCap = New TextFormat() With {
              .FontName = _fontFamily,
              .FontBold = True,
              .FontSize = 16,
              .LineGap = 24
          }
          '' Index word and pages format:
          Dim tfRun = New TextFormat() With {
              .FontName = _fontFamily,
              .FontSize = 10
          }
          '' Page headers/footers:
          Dim tfHdr = New TextFormat() With {
              .FontName = _fontFamily,
              .FontItalic = True,
              .FontSize = 10
          }
          '' FirstLineIndent = -18 sets up hanging indent.
          '' Each column takes 46% of the content width; the second column
          '' starts at 54% (see psas below), leaving a gap in between:
          Dim tl = New TextLayout(72) With {
              .FontCollection = _fc,
              .FirstLineIndent = -18,
              .MaxWidth = pageWidth,
              .MaxHeight = pageHeight,
              .MarginLeft = margin,
              .MarginRight = margin,
              .MarginBottom = margin,
              .MarginTop = margin,
              .ColumnWidth = cW * 0.46F,
              .TextAlignment = TextAlignment.Leading,
              .ParagraphSpacing = 4,
              .LineGapBeforeFirstLine = False
          }

          '' The list of text runs created for page numbers, each paired with
          '' the zero-based index of the page it refers to:
          Dim pgnumRuns = New List(Of Tuple(Of TextRun, Integer))()
          '' This loop builds the index on the TextLayout, saving the text runs
          '' created for each page number rendered. Note that at this point
          '' (prior to the PerformLayout(true) call) the text runs do not contain any info
          '' about their code points and render locations, so we can only save the text runs here.
          '' Later they will be used to add links to referenced pages in the PDF:
          Dim litera As Char = " "c
          For Each kvp In index
              Dim word = kvp.Key
              Dim pageIndices = kvp.Value
              '' Start a new caption whenever the first letter changes:
              Dim firstLetter = Char.ToUpperInvariant(word(0))
              If firstLetter <> litera Then
                  litera = firstLetter
                  tl.Append($"{litera}{ChrW(&H2029)}", tfCap)
              End If
              tl.Append(word, tfRun)
              tl.Append(" ", tfRun)
              '' NOTE: the loop counter is deliberately advanced inside the body
              '' ("fast forward" below) to skip pages folded into a range.
              For i = 0 To pageIndices.Count - 1
                  Dim from_ = pageIndices(i)
                  '' Page numbers are shown 1-based, links use the 0-based index:
                  Dim tr = tl.Append((from_ + 1).ToString(), tfRun)
                  pgnumRuns.Add(Tuple.Create(Of TextRun, Integer)(tr, from_))
                  '' We merge sequential pages into "..-M".
                  '' k is the last index of the run of consecutive pages starting at i:
                  Dim k = i
                  For j = i + 1 To pageIndices.Count - 1
                      If pageIndices(j) <> pageIndices(j - 1) + 1 Then
                          Exit For
                      End If
                      k = j
                  Next
                  '' Only runs of three or more pages are collapsed into a range:
                  If k > i + 1 Then
                      tl.Append("-", tfRun)
                      Dim to_ = pageIndices(k)
                      tr = tl.Append((to_ + 1).ToString(), tfRun)
                      pgnumRuns.Add(Tuple.Create(Of TextRun, Integer)(tr, to_))
                      '' Fast forward:
                      i = k
                  End If
                  If i < pageIndices.Count - 1 Then
                      tl.Append(", ", tfRun)
                  Else
                      tl.AppendLine(tfRun)
                  End If
              Next
          Next
          '' This calculates the glyphs and lays out the whole index.
          '' The tl.SplitAndBalance() call in the loop below does not require redoing the layout:
          tl.PerformLayout(True)

          ''
          '' Now we are ready to split and render the text layout, and also add links to page numbers.
          ''

          '' Split areas and options - see BalancedColumns for details:
          Dim psas() As PageSplitArea = {
              New PageSplitArea(tl) With {.MarginLeft = tl.MarginLeft + (cW * 0.54F)}
          }
          Dim tso = New TextSplitOptions(tl) With {
              .KeepParagraphLinesTogether = True
          }

          '' First original code point index in the current column:
          Dim cpiStart = 0
          '' Max+1 original code point index in the current column:
          Dim cpiEnd = 0
          '' Current index in pgnumRuns:
          Dim pgnumRunsIdx = 0

          '' Method to add links to actual pages over page numbers in the current column.
          '' It must be called once per rendered column, in rendering order, because it
          '' advances cpiStart/cpiEnd/pgnumRunsIdx as it goes.
          '' (Assumes TextRun.CodePointIndex is relative to the original, unsplit layout.)
          Dim linkIndices As Action(Of TextLayout, Page) =
              Sub(tl_, page_)
                  cpiEnd += tl_.CodePointCount
                  While pgnumRunsIdx < pgnumRuns.Count
                      Dim run = pgnumRuns(pgnumRunsIdx)
                      Dim textRun = run.Item1
                      Dim cpi = textRun.CodePointIndex
                      '' The run belongs to a later column - stop here:
                      If cpi >= cpiEnd Then
                          Exit While
                      End If
                      '' Convert to a code point index within this column's layout:
                      cpi -= cpiStart
                      Dim rects = tl_.GetTextRects(cpi, textRun.CodePointCount)
                      Debug.Assert(rects.Count > 0)
                      '' Debug.Assert is compiled out of release builds, so guard the access:
                      If rects.Count > 0 Then
                          page_.Annotations.Add(New LinkAnnotation(rects(0).ToRectangleF(), New DestinationFit(run.Item2)))
                      End If
                      pgnumRunsIdx += 1
                  End While
                  cpiStart += tl_.CodePointCount
              End Sub

          '' Split and render the index in 2 columns:
          Dim page = doc.Pages.Add()
          While True
              Dim g = page.Graphics
              '' Add a simple page header:
              g.DrawString($"Index generated by DsPdf on {tStart:R}", tfHdr,
                  New RectangleF(margin, 0, pageWidth - margin * 2, margin),
                  TextAlignment.Center, ParagraphAlignment.Center, False)
              '' 'rest' will accept the text that did not fit on this page:
              Dim rest As TextLayout = Nothing
              Dim splitRes = tl.SplitAndBalance(psas, tso, rest)
              '' Render text (left column, then right column):
              g.DrawTextLayout(tl, PointF.Empty)
              g.DrawTextLayout(psas(0).TextLayout, PointF.Empty)
              '' Add links from page numbers to pages (same order as rendering):
              linkIndices(tl, page)
              linkIndices(psas(0).TextLayout, page)
              '' Are we done yet?
              If splitRes <> SplitResult.Split Then
                  Exit While
              End If
              '' Continue with the remaining text on a new page:
              tl = rest
              page = doc.Pages.Add()
          End While
          '' Done:
      End Sub

      '' Creates a sample document with 100 pages of 'lorem ipsum'.
      '' Returns the path of the temporary file holding the generated PDF;
      '' the file is not deleted by this method.
      '' NOTE(review): nothing in this class calls this method.
      Private Function MakeDocumentToIndex() As String
          Const N As Integer = 100
          Dim tfile = Path.GetTempFileName()
          Using fsOut = New FileStream(tfile, FileMode.Open, FileAccess.ReadWrite)
              Dim tdoc = New GcPdfDocument()
              '' See StartEndDoc for details on StartDoc/EndDoc mode:
              tdoc.StartDoc(fsOut)
              '' Prep a TextLayout to hold/format the text:
              Dim tl = New TextLayout(72)
              tl.FontCollection = _fc
              tl.DefaultFormat.FontName = _fontFamily
              tl.DefaultFormat.FontSize = 12
              '' Use TextLayout to layout the whole page including margins:
              tl.MaxHeight = tdoc.PageSize.Height
              tl.MaxWidth = tdoc.PageSize.Width
              tl.MarginAll = 72
              tl.FirstLineIndent = 72 / 2
              '' Generate the document, reusing the same TextLayout for every page:
              For pageIdx = 0 To N - 1
                  tl.Append(Util.LoremIpsum(1))
                  tl.PerformLayout(True)
                  tdoc.NewPage().Graphics.DrawTextLayout(tl, PointF.Empty)
                  tl.Clear()
              Next
              tdoc.EndDoc()
          End Using
          Return tfile
      End Function
  End Class
  302.