WordIndex.vb
- ''
- '' This code is part of Document Solutions for PDF demos.
- '' Copyright (c) MESCIUS inc. All rights reserved.
- ''
- Imports System.IO
- Imports System.Drawing
- Imports GrapeCity.Documents.Pdf
- Imports GrapeCity.Documents.Pdf.TextMap
- Imports GrapeCity.Documents.Text
- Imports GrapeCity.Documents.Common
- Imports GrapeCity.Documents.Pdf.Annotations
-
- '' This sample loads an existing PDF, and using a predefined list of key words,
- '' builds an alphabetical index of those words linked to pages where they occur
- '' in the document. The generated index pages are appended to the original document,
- '' and saved in a new PDF.
- '' The index is rendered in two balanced columns, using the technique
- '' demonstrated in the BalancedColumns sample.
- ''
- '' NOTE: if you download this sample and run it locally on your own system
- '' without a valid DsPdf license, only the first five pages of the sample PDF
- '' will be loaded, and the index will be generated for those five pages only.
'' Builds an alphabetical keyword index for an existing PDF and appends it,
'' rendered in two balanced columns with clickable page numbers, to the document.
Public Class WordIndex

    '' Font collection to hold the fonts we need:
    Private _fc As FontCollection = New FontCollection()
    '' Font family used throughout this sample (this is not case-sensitive):
    Const _fontFamily As String = "segoe ui"

    '' Main sample entry.
    '' Loads the sample PDF, appends the keyword index to it and saves the result
    '' into the passed stream.
    ''   stream - the output stream that receives the resulting PDF.
    '' Returns the page count of the saved document (original plus index pages).
    Function CreatePDF(ByVal stream As Stream) As Integer
        '' Set up a font collection with the fonts we need:
        _fc.RegisterDirectory(Path.Combine("Resources", "Fonts"))

        '' Get the PDF to add index to:
        Dim tfile = Path.Combine("Resources", "PDFs", "CompleteJavaScriptBook.pdf")

        '' The list of words on which we will build the index
        '' (case-insensitive duplicates and empty entries are dropped):
        Dim words = _keywords.Distinct(StringComparer.InvariantCultureIgnoreCase).Where(Function(w_) Not String.IsNullOrEmpty(w_))

        '' Load the PDF and add the index; the source stream is kept open
        '' until the document has been saved:
        Using fs = New FileStream(tfile, FileMode.Open, FileAccess.Read)
            Dim doc = New GcPdfDocument()
            doc.Load(fs)
            '' Remember the page count before the index is appended. It is also
            '' the zero-based index of the first index page:
            Dim origPageCount = doc.Pages.Count
            '' Build and add the index:
            AddWordIndex(doc, words)
            '' Open document on the first index page by default
            '' (may not work in browser viewers, but works in Acrobat):
            doc.OpenAction = New DestinationFit(origPageCount)
            '' Done:
            doc.Save(stream)
            Return doc.Pages.Count
        End Using
    End Function

    '' The list of words to build the index on:
    Private ReadOnly _keywords() As String =
    {
        "JavaScript", "Framework", "MVC", "npm", "URL", "CDN", "HTML5", "CSS", "ES2015", "web",
        "Node.js", "API", "model", "view", "controller", "data management", "UI", "HTML",
        "API", "function", "var", "component", "design pattern", "React.js", "Angular", "AJAX",
        "DOM", "TypeScript", "ECMAScript", "CLI", "Wijmo", "CoffeeScript", "Elm",
        "plugin", "VueJS", "Knockout", "event", "AngularJS", "pure JS", "data binding", "OOP", "GrapeCity",
        "gauge", "JSX", "mobile", "desktop", "Vue", "template", "server-side", "client-side",
        "SPEC", "RAM", "ECMA"
    }

    '' Calling FindText() on a document or a page builds text maps for each page on the fly.
    '' Reusing cached text maps speeds things up a lot.
    ''   maps - the cached text maps to search, one per page.
    ''   tp   - the search parameters.
    '' Returns the sorted set of indices of the pages on which a match was reported.
    Private Function FindTextPages(ByVal maps As ITextMap(), ByVal tp As FindTextParams) As SortedSet(Of Integer)
        Dim finds = New SortedSet(Of Integer)
        For Each map In maps
            Dim pageIdx = map.Page.Index
            '' Every reported match records the page; the set ignores repeats,
            '' and the Boolean returned by Add() is not used:
            map.FindText(tp, Function(fp_) finds.Add(pageIdx))
        Next
        Return finds
    End Function

    '' Builds the index proper: for each key word, the list of page indices
    '' (ascending) on which the word or its naive plural was found.
    '' Words that are not found at all are left out.
    ''   textMaps - the cached text maps of all pages of the document.
    ''   words    - the key words to look up.
    '' Returns a dictionary sorted on the key words.
    Private Function BuildIndex(ByVal textMaps As ITextMap(), ByVal words As IEnumerable(Of String)) As SortedDictionary(Of String, List(Of Integer))
        '' Words and page indices where they occur, sorted on words:
        Dim index = New SortedDictionary(Of String, List(Of Integer))()

        '' Here the main loop building the index is on key words.
        '' An alternative would be to loop over the pages.
        '' Depending on the relative sizes of the keyword dictionary vs
        '' the number of pages in the document, one or the other might be better,
        '' but this is beyond the scope of this sample.
        For Each word In words
            '' Skip entries that cannot be indexed, or that were already processed
            '' (a repeated key would make index.Add() throw):
            If String.IsNullOrEmpty(word) OrElse index.ContainsKey(word) Then
                Continue For
            End If
            '' Multi-word phrases (those containing a space) are not searched as whole words.
            '' NOTE(review): the third FindTextParams argument is passed as False here -
            '' presumably 'match case'; confirm against the FindTextParams documentation.
            Dim wholeWord As Boolean = word.IndexOf(" "c) = -1
            Dim pgs = FindTextPages(textMaps, New FindTextParams(word, wholeWord, False))
            '' A very simplistic way of also finding plurals:
            If wholeWord AndAlso Not word.EndsWith("s") Then
                pgs.UnionWith(FindTextPages(textMaps, New FindTextParams(word + "s", wholeWord, False)))
            End If
            If pgs.Any() Then
                index.Add(word, pgs.ToList())
            End If
        Next
        Return index
    End Function

    '' Appends the whole index (letter captions, words and page numbers) to the passed
    '' TextLayout, and returns the text runs created for the page numbers, each paired
    '' with the zero-based index of the page it refers to, in the order they were appended.
    '' Note that at this point (prior to the PerformLayout(True) call) the text runs do not
    '' contain any info about their code points and render locations, so we can only save
    '' the text runs here. Later they are used to add links to referenced pages in the PDF.
    ''   tl    - the layout to append to.
    ''   index - key words with the ascending lists of page indices where they occur.
    ''   tfCap - the format of the letter captions.
    ''   tfRun - the format of the words and page numbers.
    Private Function AppendIndexEntries(ByVal tl As TextLayout,
                                        ByVal index As SortedDictionary(Of String, List(Of Integer)),
                                        ByVal tfCap As TextFormat,
                                        ByVal tfRun As TextFormat) As List(Of Tuple(Of TextRun, Integer))
        '' The list of text runs created for page numbers:
        Dim pgnumRuns = New List(Of Tuple(Of TextRun, Integer))()
        '' The letter of the group currently being written:
        Dim litera As Char = " "c
        For Each kvp In index
            Dim word = kvp.Key
            Dim pageIndices = kvp.Value
            '' Start a new letter group when the (upper-cased) first letter changes.
            '' &H2029 is the Unicode paragraph separator:
            Dim initial = Char.ToUpper(word(0))
            If initial <> litera Then
                litera = initial
                tl.Append($"{litera}{ChrW(&H2029)}", tfCap)
            End If
            tl.Append(word, tfRun)
            tl.Append(" ", tfRun)
            Dim i = 0
            While i < pageIndices.Count
                '' Page numbers are shown 1-based, links use the 0-based index:
                Dim from_ = pageIndices(i)
                Dim tr = tl.Append((from_ + 1).ToString(), tfRun)
                pgnumRuns.Add(Tuple.Create(Of TextRun, Integer)(tr, from_))
                '' Find the last page of the run of sequential pages starting at i:
                Dim k = i
                While k + 1 < pageIndices.Count AndAlso pageIndices(k + 1) = pageIndices(k) + 1
                    k += 1
                End While
                '' We merge runs of three or more sequential pages into "N-M"
                '' (two adjacent pages are still listed separately):
                If k > i + 1 Then
                    tl.Append("-", tfRun)
                    Dim to_ = pageIndices(k)
                    tr = tl.Append((to_ + 1).ToString(), tfRun)
                    pgnumRuns.Add(Tuple.Create(Of TextRun, Integer)(tr, to_))
                    '' Fast forward:
                    i = k
                End If
                If i < pageIndices.Count - 1 Then
                    tl.Append(", ", tfRun)
                Else
                    tl.AppendLine(tfRun)
                End If
                i += 1
            End While
        Next
        Return pgnumRuns
    End Function

    '' Adds a word index to the end of the passed document:
    ''   doc   - the document to index; the index pages are appended to it.
    ''   words - the key words to build the index on.
    Private Sub AddWordIndex(ByVal doc As GcPdfDocument, ByVal words As IEnumerable(Of String))
        Dim tStart = Util.TimeNow()

        '' Build text maps for all pages to speed up FindText() calls
        Dim textMaps(doc.Pages.Count - 1) As ITextMap
        For i = 0 To doc.Pages.Count - 1
            textMaps(i) = doc.Pages(i).GetTextMap()
        Next

        '' Words and page indices where they occur, sorted on words:
        Dim index = BuildIndex(textMaps, words)

        '' Prepare to render the index. The whole index is built
        '' in a single TextLayout instance, set up to render it
        '' in two columns per page.
        '' The main rendering loop uses the TextLayout.SplitAndBalance method
        '' using the approach demonstrated in BalancedColumns sample.
        '' The complication here is that we need to associate a link to the
        '' relevant page with each page number rendered, see linkIndices below.
        '' Set up the TextLayout:
        Const margin = 72.0F
        Dim pageWidth = doc.PageSize.Width
        Dim pageHeight = doc.PageSize.Height
        Dim cW = pageWidth - margin * 2
        '' Caption (index letter) format:
        Dim tfCap = New TextFormat() With {
            .FontName = _fontFamily,
            .FontBold = True,
            .FontSize = 16,
            .LineGap = 24
        }
        '' Index word and pages format:
        Dim tfRun = New TextFormat() With {
            .FontName = _fontFamily,
            .FontSize = 10
        }
        '' Page headers/footers:
        Dim tfHdr = New TextFormat() With {
            .FontName = _fontFamily,
            .FontItalic = True,
            .FontSize = 10
        }
        '' FirstLineIndent = -18 sets up hanging indent:
        Dim tl = New TextLayout(72) With {
            .FontCollection = _fc,
            .FirstLineIndent = -18,
            .MaxWidth = pageWidth,
            .MaxHeight = pageHeight,
            .MarginLeft = margin,
            .MarginRight = margin,
            .MarginBottom = margin,
            .MarginTop = margin,
            .ColumnWidth = cW * 0.46F,
            .TextAlignment = TextAlignment.Leading,
            .ParagraphSpacing = 4,
            .LineGapBeforeFirstLine = False
        }

        '' Build the index on the TextLayout, keeping the text runs
        '' created for each page number rendered:
        Dim pgnumRuns = AppendIndexEntries(tl, index, tfCap, tfRun)

        '' This calculates the glyphs and lays out the whole index.
        '' The tl.SplitAndBalance() call in the loop below does not require redoing the layout:
        tl.PerformLayout(True)

        ''
        '' Now we are ready to split and render the text layout, and also add links to page numbers.
        ''

        '' Split areas and options - see BalancedColumns for details:
        Dim psas() As PageSplitArea = {
            New PageSplitArea(tl) With {.MarginLeft = tl.MarginLeft + (cW * 0.54F)}
        }
        Dim tso = New TextSplitOptions(tl) With {
            .KeepParagraphLinesTogether = True
        }

        '' First original code point index in the current column:
        Dim cpiStart = 0
        '' Max+1 original code point index in the current column:
        Dim cpiEnd = 0
        '' Current index in pgnumRuns:
        Dim pgnumRunsIdx = 0

        '' Method to add links to actual pages over page numbers in the current column.
        '' It must be called once per rendered column, in rendering order, because it
        '' advances cpiStart/cpiEnd/pgnumRunsIdx as it goes:
        Dim linkIndices As Action(Of TextLayout, Page) =
            Sub(tl_, page_)
                cpiEnd += tl_.CodePointCount
                While pgnumRunsIdx < pgnumRuns.Count
                    Dim run = pgnumRuns(pgnumRunsIdx)
                    Dim numRun = run.Item1
                    Dim cpi = numRun.CodePointIndex
                    '' The remaining page numbers belong to the following columns:
                    If cpi >= cpiEnd Then
                        Exit While
                    End If
                    '' Convert to a code point index relative to this column:
                    cpi -= cpiStart
                    Dim rects = tl_.GetTextRects(cpi, numRun.CodePointCount)
                    Debug.Assert(rects.Count > 0)
                    '' Link every rectangle occupied by the number. This also avoids
                    '' indexing into an empty list if no rectangles were returned:
                    For Each rect In rects
                        page_.Annotations.Add(New LinkAnnotation(rect.ToRectangleF(), New DestinationFit(run.Item2)))
                    Next
                    pgnumRunsIdx += 1
                End While
                cpiStart += tl_.CodePointCount
            End Sub

        '' Split and render the index in 2 columns:
        Dim indexPage = doc.Pages.Add()
        While True
            Dim g = indexPage.Graphics
            '' Add a simple page header:
            g.DrawString($"Index generated by DsPdf on {tStart:R}", tfHdr,
                New RectangleF(margin, 0, pageWidth - margin * 2, margin),
                TextAlignment.Center, ParagraphAlignment.Center, False)
            '' 'rest' will accept the text that did not fit on this page:
            Dim rest As TextLayout = Nothing
            Dim outcome = tl.SplitAndBalance(psas, tso, rest)
            '' Render text:
            g.DrawTextLayout(tl, PointF.Empty)
            g.DrawTextLayout(psas(0).TextLayout, PointF.Empty)
            '' Add links from page numbers to pages:
            linkIndices(tl, indexPage)
            linkIndices(psas(0).TextLayout, indexPage)
            '' Are we done yet?
            If outcome <> SplitResult.Split Then
                Exit While
            End If
            tl = rest
            indexPage = doc.Pages.Add()
        End While
        '' Done:
    End Sub

    '' Creates a sample document with 100 pages of 'lorem ipsum' in a temporary file,
    '' and returns the path of that file.
    '' Not called from anywhere within this class.
    Private Function MakeDocumentToIndex() As String
        Const N = 100
        '' GetTempFileName() creates the (empty) file, so it can be opened with FileMode.Open:
        Dim tfile = Path.GetTempFileName()
        Using fsOut = New FileStream(tfile, FileMode.Open, FileAccess.ReadWrite)
            Dim tdoc = New GcPdfDocument()
            '' See StartEndDoc for details on StartDoc/EndDoc mode:
            tdoc.StartDoc(fsOut)
            '' Prep a TextLayout to hold/format the text:
            Dim tl = New TextLayout(72)
            tl.FontCollection = _fc
            tl.DefaultFormat.FontName = _fontFamily
            tl.DefaultFormat.FontSize = 12
            '' Use TextLayout to layout the whole page including margins:
            tl.MaxHeight = tdoc.PageSize.Height
            tl.MaxWidth = tdoc.PageSize.Width
            tl.MarginAll = 72
            tl.FirstLineIndent = 72 / 2
            '' Generate the document, reusing the same layout for each page:
            For pageIdx = 0 To N - 1
                tl.Append(Util.LoremIpsum(1))
                tl.PerformLayout(True)
                tdoc.NewPage().Graphics.DrawTextLayout(tl, PointF.Empty)
                tl.Clear()
            Next
            tdoc.EndDoc()
        End Using
        Return tfile
    End Function
End Class
-