WordIndex.cs
- //
- // This code is part of Document Solutions for PDF demos.
- // Copyright (c) MESCIUS inc. All rights reserved.
- //
- using System;
- using System.IO;
- using System.Drawing;
- using System.Linq;
- using System.Collections.Generic;
- using GrapeCity.Documents.Pdf;
- using GrapeCity.Documents.Pdf.TextMap;
- using GrapeCity.Documents.Text;
- using GrapeCity.Documents.Common;
- using GrapeCity.Documents.Pdf.Annotations;
-
- namespace DsPdfWeb.Demos.Basics
- {
- // This sample loads an existing PDF, and using a predefined list of key words,
- // builds an alphabetical index of those words linked to pages where they occur
- // in the document. The generated index pages are appended to the original document,
- // and saved in a new PDF.
- // The index is rendered in two balanced columns, using the technique
- // demonstrated in the BalancedColumns sample.
- //
- // NOTE: if you download this sample and run it locally on your own system
- // without a valid DsPdf license, only the first five pages of the sample PDF
- // will be loaded, and the index will be generated for those five pages only.
- public class WordIndex
- {
// Name of the font family requested by every text format in this sample
// (the family name is not case-sensitive):
private const string _fontFamily = "segoe ui";
// Fonts available to the text layouts of this sample;
// CreatePDF() registers the Resources/Fonts directory with it:
private FontCollection _fc = new FontCollection();
-
// Sample entry point.
// Loads Resources/PDFs/CompleteJavaScriptBook.pdf, appends a keyword index
// to it, and writes the resulting PDF to 'stream'.
// Returns the page count of the saved document (original pages plus index pages).
public int CreatePDF(Stream stream)
{
    // Make the fonts shipped with the sample available to the text layouts:
    _fc.RegisterDirectory(Path.Combine("Resources", "Fonts"));

    // Keywords to index: case-insensitive duplicates collapsed, empty entries dropped.
    // (The query is lazy; it is enumerated by AddWordIndex.)
    IEnumerable<string> indexWords = _keywords
        .Distinct(StringComparer.InvariantCultureIgnoreCase)
        .Where(w_ => !string.IsNullOrEmpty(w_));

    // The PDF that receives the index:
    string sourcePath = Path.Combine("Resources", "PDFs", "CompleteJavaScriptBook.pdf");

    // The source stream stays open until the document has been saved:
    using (FileStream source = File.OpenRead(sourcePath))
    {
        var doc = new GcPdfDocument();
        doc.Load(source);

        // Page count before the index is added, i.e. the index of the first index page:
        int firstIndexPage = doc.Pages.Count;

        AddWordIndex(doc, indexWords);

        // Ask viewers to open the document on the first index page
        // (may not work in browser viewers, but works in Acrobat):
        doc.OpenAction = new DestinationFit(firstIndexPage);

        doc.Save(stream);
        int totalPages = doc.Pages.Count;
        return totalPages;
    }
}
-
// Keywords the index is built from.
// The list may contain repeats (e.g. "API"): CreatePDF() collapses them
// case-insensitively and drops empty entries before the index is built.
// Entries containing a space are not searched as whole words (see AddWordIndex()).
private readonly string[] _keywords =
{
    "JavaScript", "Framework", "MVC", "npm", "URL", "CDN",
    "HTML5", "CSS", "ES2015", "web", "Node.js", "API",
    "model", "view", "controller", "data management", "UI", "HTML",
    "API", "function", "var", "component", "design pattern", "React.js",
    "Angular", "AJAX", "DOM", "TypeScript", "ECMAScript", "CLI",
    "Wijmo", "CoffeeScript", "Elm", "plugin", "VueJS", "Knockout",
    "event", "AngularJS", "pure JS", "data binding", "OOP", "GrapeCity",
    "gauge", "JSX", "mobile", "desktop", "Vue", "template",
    "server-side", "client-side", "SPEC", "RAM", "ECMA",
};
-
// Runs the search described by 'tp' against each of the passed (pre-built) text maps
// and returns the indices of the pages that produced at least one hit,
// in ascending order and without duplicates.
// Calling FindText() on a document or a page would build the text maps on the fly;
// searching cached maps instead speeds things up a lot.
private SortedSet<int> FindTextPages(ITextMap[] maps, FindTextParams tp)
{
    var hitPages = new SortedSet<int>();
    for (int i = 0; i < maps.Length; ++i)
    {
        ITextMap map = maps[i];
        int pageIndex = map.Page.Index;
        // The callback only records the page; the set ignores repeated hits on the same page:
        map.FindText(tp, (fp_) => hitPages.Add(pageIndex));
    }
    return hitPages;
}
-
// Builds an alphabetical index of 'words' and appends it to 'doc' as new pages.
// For each word the pages of 'doc' containing it are looked up; words that are
// found are listed under their (upper-cased) first letter together with their
// 1-based page numbers, and every rendered page number is covered by a link
// annotation that jumps to the referenced page.
// The index is laid out in two balanced columns per page, using the technique
// demonstrated in the BalancedColumns sample.
private void AddWordIndex(GcPdfDocument doc, IEnumerable<string> words)
{
    // Time stamp shown in the header of every index page:
    var generatedAt = Common.Util.TimeNow();

    // Build the text maps of all existing pages once, to speed up the FindText() calls:
    int sourcePageCount = doc.Pages.Count;
    var textMaps = new ITextMap[sourcePageCount];
    for (int p = 0; p < sourcePageCount; ++p)
        textMaps[p] = doc.Pages[p].GetTextMap();

    // Found words mapped to the (ascending) indices of pages where they occur, sorted on words.
    // The main loop building the index runs over the key words; looping over the pages
    // would be an alternative. Which one is better depends on the relative sizes of the
    // keyword list and the document, but this is beyond the scope of this sample.
    var index = new SortedDictionary<string, List<int>>();
    foreach (string word in words)
    {
        // Entries without a space are searched with the whole-word flag set:
        bool wholeWord = word.IndexOf(' ') < 0;
        SortedSet<int> pages = FindTextPages(textMaps, new FindTextParams(word, wholeWord, false));
        // A very simplistic way of also finding plurals:
        if (wholeWord && !word.EndsWith('s'))
        {
            SortedSet<int> pluralPages = FindTextPages(textMaps, new FindTextParams(word + "s", wholeWord, false));
            pages.UnionWith(pluralPages);
        }
        // Words that do not occur in the document are left out of the index:
        if (pages.Count > 0)
            index.Add(word, new List<int>(pages));
    }

    // The whole index is built in a single TextLayout instance, which is then split
    // with TextLayout.SplitAndBalance() into two columns per page. The complication
    // is that each rendered page number must get a link to its page, see LinkPageNumbers below.
    const float margin = 72;
    var pageWidth = doc.PageSize.Width;
    var pageHeight = doc.PageSize.Height;
    var contentWidth = pageWidth - margin * 2;

    // Caption (index letter) format:
    var captionFormat = new TextFormat();
    captionFormat.FontName = _fontFamily;
    captionFormat.FontBold = true;
    captionFormat.FontSize = 16;
    captionFormat.LineGap = 24;

    // Index word and page numbers format:
    var entryFormat = new TextFormat();
    entryFormat.FontName = _fontFamily;
    entryFormat.FontSize = 10;

    // Page header format:
    var headerFormat = new TextFormat();
    headerFormat.FontName = _fontFamily;
    headerFormat.FontItalic = true;
    headerFormat.FontSize = 10;

    // The layout holding the whole index; the negative FirstLineIndent sets up a hanging indent:
    var layout = new TextLayout(72);
    layout.FontCollection = _fc;
    layout.FirstLineIndent = -18;
    layout.MaxWidth = pageWidth;
    layout.MaxHeight = pageHeight;
    layout.MarginLeft = margin;
    layout.MarginRight = margin;
    layout.MarginBottom = margin;
    layout.MarginTop = margin;
    layout.ColumnWidth = contentWidth * 0.46f;
    layout.TextAlignment = TextAlignment.Leading;
    layout.ParagraphSpacing = 4;
    layout.LineGapBeforeFirstLine = false;

    // Text runs created for page numbers, paired with the 0-based page index they refer to.
    // At this point (prior to the PerformLayout(true) call) the runs carry no info about
    // their code points or render locations, so they can only be remembered here;
    // the links are added later, while rendering.
    var pageNumberRuns = new List<Tuple<TextRun, int>>();

    // Letter of the caption emitted last (a space means "none yet"):
    char currentLetter = ' ';
    foreach (KeyValuePair<string, List<int>> entry in index)
    {
        string word = entry.Key;
        List<int> wordPages = entry.Value;

        // Start a new caption whenever the first letter changes:
        char initial = Char.ToUpper(word[0]);
        if (initial != currentLetter)
        {
            currentLetter = initial;
            layout.Append($"{currentLetter}\u2029", captionFormat);
        }

        layout.Append(word, entryFormat);
        layout.Append(" ", entryFormat);

        int pos = 0;
        while (pos < wordPages.Count)
        {
            int firstPage = wordPages[pos];
            TextRun firstRun = layout.Append((firstPage + 1).ToString(), entryFormat);
            pageNumberRuns.Add(Tuple.Create(firstRun, firstPage));

            // Find the end of the run of sequential pages starting at 'pos':
            int runEnd = pos;
            while (runEnd + 1 < wordPages.Count && wordPages[runEnd + 1] == wordPages[runEnd] + 1)
                ++runEnd;

            // Runs of three or more sequential pages are merged into "N-M":
            if (runEnd > pos + 1)
            {
                layout.Append("-", entryFormat);
                int lastPage = wordPages[runEnd];
                TextRun lastRun = layout.Append((lastPage + 1).ToString(), entryFormat);
                pageNumberRuns.Add(Tuple.Create(lastRun, lastPage));
                // Fast forward past the merged pages:
                pos = runEnd;
            }

            // Separate page numbers with commas, end the entry after the last one:
            bool morePages = pos < wordPages.Count - 1;
            if (morePages)
                layout.Append(", ", entryFormat);
            else
                layout.AppendLine(entryFormat);

            ++pos;
        }
    }

    // This calculates the glyphs and lays out the whole index.
    // The SplitAndBalance() calls in the loop below do not require redoing the layout:
    layout.PerformLayout(true);

    // Split area for the second column and split options - see BalancedColumns for details:
    var secondColumn = new PageSplitArea(layout);
    secondColumn.MarginLeft = layout.MarginLeft + (contentWidth * 0.54f);
    var splitAreas = new[] { secondColumn };
    var splitOptions = new TextSplitOptions(layout);
    splitOptions.KeepParagraphLinesTogether = true;

    // State shared with LinkPageNumbers:
    // first original code point index in the current column,
    int cpiStart = 0;
    // max+1 original code point index in the current column,
    int cpiEnd = 0;
    // and the next entry of pageNumberRuns that still needs a link:
    int nextRunIdx = 0;

    // Split and render the index, two columns per added page:
    while (true)
    {
        Page page = doc.Pages.Add();
        var g = page.Graphics;

        // Add a simple page header:
        g.DrawString($"Index generated by DsPdf on {generatedAt:R}", headerFormat,
            new RectangleF(margin, 0, pageWidth - margin * 2, margin),
            TextAlignment.Center, ParagraphAlignment.Center, false);

        // 'remainder' accepts the text that did not fit on this page:
        var outcome = layout.SplitAndBalance(splitAreas, splitOptions, out TextLayout remainder);

        // Render the first column, then the second:
        g.DrawTextLayout(layout, PointF.Empty);
        g.DrawTextLayout(splitAreas[0].TextLayout, PointF.Empty);

        // Add links from page numbers to pages, in the same column order:
        LinkPageNumbers(layout, page);
        LinkPageNumbers(splitAreas[0].TextLayout, page);

        // Stop unless some text was split off for another page:
        if (outcome != SplitResult.Split)
            break;
        layout = remainder;
    }

    // Adds links to the referenced pages over the page numbers rendered in one column.
    // Must be called once per rendered column, in rendering order, because it advances
    // the shared code point window (cpiStart/cpiEnd) and the nextRunIdx cursor.
    void LinkPageNumbers(TextLayout column, Page target)
    {
        int columnLength = column.CodePointCount;
        cpiEnd += columnLength;
        while (nextRunIdx < pageNumberRuns.Count)
        {
            Tuple<TextRun, int> numberRun = pageNumberRuns[nextRunIdx];
            TextRun textRun = numberRun.Item1;
            // Runs starting at or after the end of this column belong to a later one:
            if (textRun.CodePointIndex >= cpiEnd)
                break;
            // Translate the original code point index into an index within this column:
            int localIndex = textRun.CodePointIndex - cpiStart;
            var rects = column.GetTextRects(localIndex, textRun.CodePointCount);
            System.Diagnostics.Debug.Assert(rects.Count > 0);
            target.Annotations.Add(new LinkAnnotation(rects[0].ToRectangleF(), new DestinationFit(numberRun.Item2)));
            ++nextRunIdx;
        }
        cpiStart += columnLength;
    }
}
-
// Creates a sample document with 100 pages of 'lorem ipsum' in a new temporary file
// and returns the path of that file. The file is not deleted here.
// NOTE: this helper is not called anywhere in the visible part of this sample.
private string MakeDocumentToIndex()
{
    const int N = 100;
    string tfile = Path.GetTempFileName();
    // The generated PDF is written to this stream, so it must be writable:
    // File.OpenRead() would hand StartDoc() a read-only stream.
    // Path.GetTempFileName() has already created the (empty) file, hence FileMode.Open.
    using (var fsOut = new FileStream(tfile, FileMode.Open, FileAccess.ReadWrite))
    {
        var tdoc = new GcPdfDocument();
        // See StartEndDoc for details on StartDoc/EndDoc mode:
        tdoc.StartDoc(fsOut);
        // Prep a TextLayout to hold/format the text:
        var tl = new TextLayout(72);
        tl.FontCollection = _fc;
        tl.DefaultFormat.FontName = _fontFamily;
        tl.DefaultFormat.FontSize = 12;
        // Use TextLayout to layout the whole page including margins:
        tl.MaxHeight = tdoc.PageSize.Height;
        tl.MaxWidth = tdoc.PageSize.Width;
        tl.MarginAll = 72;
        tl.FirstLineIndent = 72 / 2;
        // Generate the document, reusing the same layout for every page:
        for (int pageIdx = 0; pageIdx < N; ++pageIdx)
        {
            tl.Append(Common.Util.LoremIpsum(1));
            tl.PerformLayout(true);
            tdoc.NewPage().Graphics.DrawTextLayout(tl, PointF.Empty);
            // Drop this page's text before the next iteration:
            tl.Clear();
        }
        // Finish the document before the stream is disposed:
        tdoc.EndDoc();
    }
    return tfile;
}
- }
- }
-