GetTablePages.cs
  1. //
  2. // This code is part of Document Solutions for PDF demos.
  3. // Copyright (c) MESCIUS inc. All rights reserved.
  4. //
  5. using System;
  6. using System.IO;
  7. using System.Drawing;
  8. using System.Linq;
  9. using System.Collections.Generic;
  10. using GrapeCity.Documents.Pdf;
  11. using GrapeCity.Documents.Pdf.Recognition;
  12. using GrapeCity.Documents.Text;
  13. using GrapeCity.Documents.Common;
  14. using GCTEXT = GrapeCity.Documents.Text;
  15. using GCDRAW = GrapeCity.Documents.Drawing;
  16.  
  namespace DsPdfWeb.Demos
  {
    /// <summary>
    /// Demo: reads the table found on each page of a source PDF and prints
    /// the recognized rows and cells as text into a newly generated PDF.
    /// </summary>
    public class GetTablePages
    {
      /// <summary>
      /// Generates the demo document and writes it to <paramref name="stream"/>.
      /// </summary>
      /// <param name="stream">Destination stream that receives the saved PDF.</param>
      /// <returns>The number of pages in the generated document (after the source PDF was merged in).</returns>
      public int CreatePDF(Stream stream)
      {
        const float DPI = 72;
        const float margin = 36;
        const string noteText =
          "This sample loads a PDF that contains a table split between several pages (a product price list), " +
          "and extracts the tables on all pages using the Page.GetTable() method. " +
          "The extracted data is printed as a list of rows and cells. " +
          "The source product list PDF is appended to the generated document for reference.";

        var doc = new GcPdfDocument();

        // Regular cell text.
        var tf = new TextFormat();
        tf.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeui.ttf"));
        tf.FontSize = 9;
        tf.ForeColor = Color.Black;

        // Captions and the first row of each table: starts as a copy of the regular format.
        var tfHdr = new TextFormat(tf);
        tfHdr.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeuib.ttf"));
        tfHdr.FontSize = 11;
        tfHdr.ForeColor = Color.DarkBlue;

        // Marker format for positions where no cell was returned.
        var tfRed = new TextFormat(tf);
        tfRed.ForeColor = Color.Red;

        // Note that the source file stream is kept open until after doc.Save() below.
        using (var fs = File.OpenRead(Path.Combine("Resources", "PDFs", "product-list.pdf")))
        {
          var firstPage = doc.NewPage();
          firstPage.Landscape = true;
          var gfx = firstPage.Graphics;

          // Explanatory note at the top of the first page; the extracted data starts below it.
          var noteArea = new RectangleF(
            margin,
            margin,
            firstPage.Bounds.Width - margin * 2,
            firstPage.Bounds.Height - margin * 2);
          var noteRect = Common.Util.AddNote(noteText, firstPage, noteArea);

          var tl = gfx.CreateTextLayout();
          tl.MaxWidth = firstPage.Bounds.Width;
          tl.MaxHeight = firstPage.Bounds.Height;
          tl.MarginAll = margin;
          tl.MarginTop = noteRect.Bottom;
          tl.DefaultTabStops = 165;

          var docSrc = new GcPdfDocument();
          docSrc.Load(fs);

          for (int pageIdx = 0; pageIdx < docSrc.Pages.Count; ++pageIdx)
          {
            var extractOpts = CreateExtractOptions();

            // The search area starts lower on the first source page than on the following ones.
            float top = DPI;
            if (pageIdx == 0)
              top = DPI * 2;

            // Get the table at the specified bounds:
            var searchArea = new RectangleF(DPI * 0.25f, top, DPI * 8, DPI * 10.5f - top);
            var table = docSrc.Pages[pageIdx].GetTable(searchArea, extractOpts);

            // Caption for this table, followed by its rows (one line per row, cells separated by tabs):
            tl.Append($"\nTable on page {pageIdx + 1} of the source document has {table.Cols.Count} column(s) and {table.Rows.Count} row(s), table data is:", tfHdr);
            tl.AppendParagraphBreak();
            for (int r = 0; r < table.Rows.Count; ++r)
            {
              // The first row is printed using the header format.
              var rowFormat = tf;
              if (r == 0)
                rowFormat = tfHdr;

              for (int c = 0; c < table.Cols.Count; ++c)
              {
                var cell = table.GetCell(r, c);
                if (c > 0)
                  tl.Append("\t", rowFormat);
                var missing = cell == null;
                tl.Append(missing ? "<no cell>" : cell.Text, missing ? tfRed : rowFormat);
              }
              tl.AppendLine();
            }
          }

          // Print the extracted data, adding a new landscape page for each overflow part:
          var splitOpts = new TextSplitOptions(tl)
          {
            RestMarginTop = margin,
            MinLinesInFirstParagraph = 2,
            MinLinesInLastParagraph = 2
          };
          tl.PerformLayout(true);
          SplitResult outcome;
          do
          {
            outcome = tl.Split(splitOpts, out TextLayout remainder);
            doc.Pages.Last.Graphics.DrawTextLayout(tl, PointF.Empty);
            if (outcome == SplitResult.Split)
            {
              tl = remainder;
              doc.NewPage().Landscape = true;
            }
          }
          while (outcome == SplitResult.Split);

          // Append the original document for reference:
          doc.MergeWithDocument(docSrc);

          doc.Save(stream);
          return doc.Pages.Count;
        }
      }

      // Builds the table recognition options used for each source page.
      // TableExtractOptions allow fine-tuning table recognition to account for
      // specifics of the table formatting. Here the minimum distance between rows
      // is slightly increased (x1.2 of what the default callback returns) to make
      // sure cells with wrapped text are not mistaken for two cells.
      private static TableExtractOptions CreateExtractOptions()
      {
        var options = new TableExtractOptions();
        var defaultRowDistance = options.GetMinimumDistanceBetweenRows;
        options.GetMinimumDistanceBetweenRows = (list) => defaultRowDistance(list) * 1.2f;
        return options;
      }
    }
  }
  124.