// ExtractText.cs
//
// This code is part of Document Solutions for PDF demos.
// Copyright (c) MESCIUS inc. All rights reserved.
//
  5. using System;
  6. using System.IO;
  7. using System.Drawing;
  8. using System.Numerics;
  9. using System.Collections.Generic;
  10. using System.Linq;
  11. using GrapeCity.Documents.Text;
  12. using GrapeCity.Documents.Drawing;
  13. using GrapeCity.Documents.Pdf;
  14. using GrapeCity.Documents.Pdf.Annotations;
  15. using GrapeCity.Documents.Pdf.Graphics;
  16. using GCTEXT = GrapeCity.Documents.Text;
  17. using GCDRAW = GrapeCity.Documents.Drawing;
  18.  
  19. namespace DsPdfWeb.Demos
  20. {
  21. // This sample demonstrates how to extract text from an existing PDF.
  22. // It loads an arbitrary PDF into a temporary GcPdfDocument, then
  23. // retrieves text from each page of that document using the Page.GetText() method,
  24. // adds all those texts to a TextLayout and renders it into the current document.
  25. // An alternative to Page.GetText() is the method GcPdfDocument.GetText()
  26. // which retrieves the text from the whole document at once.
  27. public class ExtractText
  28. {
  29. public int CreatePDF(Stream stream)
  30. {
  31. var doc = new GcPdfDocument();
  32. var page = doc.NewPage();
  33.  
  34. var rc = Common.Util.AddNote(
  35. "This sample loads an arbitrary PDF into a temporary GcPdfDocument, " +
  36. "then retrieves text from each page of the loaded document using the Page.GetText() method, " +
  37. "adds all those texts to a TextLayout and renders it into the current document. " +
  38. "An alternative to Page.GetText() is the method GcPdfDocument.GetText() " +
  39. "which retrieves the text from the whole document at once.",
  40. page);
  41.  
  42. // Text format for captions:
  43. var tf = new TextFormat()
  44. {
  45. Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf")),
  46. FontSize = 14,
  47. ForeColor = Color.Blue
  48. };
  49. // Text layout to render the text:
  50. var tl = new TextLayout(72);
  51. tl.DefaultFormat.Font = StandardFonts.Times;
  52. tl.DefaultFormat.FontSize = 12;
  53. tl.MaxWidth = doc.PageSize.Width;
  54. tl.MaxHeight = doc.PageSize.Height;
  55. tl.MarginAll = rc.Left;
  56. tl.MarginTop = rc.Bottom + 36;
  57.  
  58. // Text split options for widow/orphan control:
  59. var to = new TextSplitOptions(tl)
  60. {
  61. MinLinesInFirstParagraph = 2,
  62. MinLinesInLastParagraph = 2,
  63. RestMarginTop = rc.Left,
  64. };
  65.  
  66. // Open an arbitrary PDF, load it into a temp document and get all page texts:
  67. using var fs = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"));
  68.  
  69. var doc1 = new GcPdfDocument();
  70. doc1.Load(fs);
  71.  
  72. // Get the texts of the loaded document's pages:
  73. var texts = new List<string>();
  74. doc1.Pages.ToList().ForEach(p_ => texts.Add(p_.GetText()));
  75.  
  76. // Add texts and captions to the text layout:
  77. for (int i = 0; i < texts.Count; ++i)
  78. {
  79. tl.AppendLine(string.Format("Text from page {0} of the loaded document:", i + 1), tf);
  80. tl.AppendLine(texts[i]);
  81. }
  82. tl.PerformLayout(true);
  83. while (true)
  84. {
  85. // 'rest' will accept the text that did not fit:
  86. var splitResult = tl.Split(to, out TextLayout rest);
  87. doc.Pages.Last.Graphics.DrawTextLayout(tl, PointF.Empty);
  88. if (splitResult != SplitResult.Split)
  89. break;
  90. tl = rest;
  91. doc.NewPage();
  92. }
  93.  
  94. // Done:
  95. doc.Save(stream);
  96. return doc.Pages.Count;
  97. }
  98. }
  99. }
  100.