ExtractParagraphs.cs
  1. //
  2. // This code is part of Document Solutions for PDF demos.
  3. // Copyright (c) MESCIUS inc. All rights reserved.
  4. //
  5. using System;
  6. using System.IO;
  7. using System.Drawing;
  8. using System.Numerics;
  9. using System.Collections.Generic;
  10. using System.Linq;
  11. using GrapeCity.Documents.Text;
  12. using GrapeCity.Documents.Drawing;
  13. using GrapeCity.Documents.Pdf;
  14. using GrapeCity.Documents.Pdf.Annotations;
  15. using GrapeCity.Documents.Pdf.Graphics;
  16. using GCTEXT = GrapeCity.Documents.Text;
  17. using GCDRAW = GrapeCity.Documents.Drawing;
  18.  
  19. namespace DsPdfWeb.Demos
  20. {
  21. // This sample demonstrates how to extract paragraphs of text from an existing PDF.
  22. // It loads an arbitrary PDF into a temporary GcPdfDocument, then retrieves
  23. // the paragraphs from each page of that document using Page.GetTextMap().Paragraphs,
  24. // adds all those paragraphs to a TextLayout and renders it into the current document.
  25. // If plain text rather than paragraphs is needed, Page.GetText() or
  26. // GcPdfDocument.GetText() (which retrieves the text from the whole document at once) can be used.
  /// <summary>
  /// Demo that pulls the paragraphs out of an existing PDF (Wetlands.pdf) and
  /// prints them, with alternating background colors, into a newly created document.
  /// </summary>
  public class ExtractParagraphs
  {
      /// <summary>
      /// Builds the demo PDF and saves it to <paramref name="stream"/>.
      /// </summary>
      /// <param name="stream">The stream that the generated document is saved to.</param>
      /// <returns>
      /// The page count of the generated document, taken after the original PDF
      /// has been merged into it.
      /// </returns>
      public int CreatePDF(Stream stream)
      {
          const int pageMargin = 36;
          Color firstShade = Color.PaleGreen;
          Color secondShade = Color.PaleGoldenrod;

          var doc = new GcPdfDocument();
          var page = doc.NewPage();

          // Explanatory note at the top of the first page; its bounds drive the layout margins below.
          var noteBounds = Common.Util.AddNote(
              "Here we load an existing PDF (Wetlands) into a temporary GcPdfDocument, " +
              "and iterate over the pages of that document, printing all paragraphs found on the page. " +
              "We alternate the background color for the paragraphs so that the bounds between paragraphs are more clear. " +
              "The original PDF is appended to the generated document for reference.",
              page,
              new RectangleF(pageMargin, pageMargin, page.Size.Width - pageMargin * 2, 0));

          // Format used for the per-page captions:
          var captionFormat = new TextFormat();
          captionFormat.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf"));
          captionFormat.FontSize = 14;
          captionFormat.ForeColor = Color.Blue;

          // Format used for the extracted paragraphs (its background color is toggled in the loop below):
          var paragraphFormat = new TextFormat();
          paragraphFormat.Font = StandardFonts.Times;
          paragraphFormat.FontSize = 12;
          paragraphFormat.BackColor = firstShade;

          // The layout spans the whole page; margins are taken from the note's bounds,
          // and on the first page the text starts 36 below the bottom of the note.
          var layout = page.Graphics.CreateTextLayout();
          layout.MaxWidth = doc.PageSize.Width;
          layout.MaxHeight = doc.PageSize.Height;
          layout.MarginAll = noteBounds.Left;
          layout.MarginTop = noteBounds.Bottom + 36;

          // Split options (widow/orphan control); continuation pages use the regular top margin:
          var splitOptions = new TextSplitOptions(layout);
          splitOptions.MinLinesInFirstParagraph = 2;
          splitOptions.MinLinesInLastParagraph = 2;
          splitOptions.RestMarginTop = noteBounds.Left;

          // Load the source PDF into a temporary document. The file stream is disposed
          // only when this method exits, i.e. after the generated document has been saved.
          using var sourceStream = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"));
          var sourceDoc = new GcPdfDocument();
          sourceDoc.Load(sourceStream);

          int sourcePageCount = sourceDoc.Pages.Count;
          for (int pageIndex = 0; pageIndex < sourcePageCount; pageIndex++)
          {
              layout.AppendLine(string.Format("Paragraphs from page {0} of the original PDF:", pageIndex + 1), captionFormat);

              foreach (var paragraph in sourceDoc.Pages[pageIndex].GetTextMap().Paragraphs)
              {
                  layout.AppendLine(paragraph.GetText(), paragraphFormat);

                  // Flip the background color for the next paragraph.
                  // NOTE(review): this presumably relies on AppendLine capturing the format's
                  // state at the time of the call - confirm against the TextLayout documentation.
                  if (paragraphFormat.BackColor == firstShade)
                  {
                      paragraphFormat.BackColor = secondShade;
                  }
                  else
                  {
                      paragraphFormat.BackColor = firstShade;
                  }
              }
          }

          layout.PerformLayout(true);

          // Draw the layout page by page: whatever does not fit on the current page
          // is moved into 'remainder' and drawn on a newly added page.
          bool moreToRender;
          do
          {
              var outcome = layout.Split(splitOptions, out TextLayout remainder);
              doc.Pages.Last.Graphics.DrawTextLayout(layout, PointF.Empty);

              moreToRender = outcome == SplitResult.Split;
              if (moreToRender)
              {
                  layout = remainder;
                  doc.NewPage();
              }
          }
          while (moreToRender);

          // Append the original document for reference:
          doc.MergeWithDocument(sourceDoc, new MergeDocumentOptions());

          // Done:
          doc.Save(stream);
          return doc.Pages.Count;
      }
  }
  110. }
  111.