ReadTagsTableData.cs
  1. //
  2. // This code is part of Document Solutions for PDF demos.
  3. // Copyright (c) MESCIUS inc. All rights reserved.
  4. //
  5. using System;
  6. using System.IO;
  7. using System.Drawing;
  8. using System.Linq;
  9. using System.Collections.Generic;
  10. using GrapeCity.Documents.Pdf;
  11. using GrapeCity.Documents.Text;
  12. using GrapeCity.Documents.Pdf.TextMap;
  13. using GrapeCity.Documents.Pdf.Structure;
  14. using GrapeCity.Documents.Pdf.Recognition.Structure;
  15. using GCTEXT = GrapeCity.Documents.Text;
  16. using GCDRAW = GrapeCity.Documents.Drawing;
  17.  
  18. namespace DsPdfWeb.Demos
  19. {
  20. // Find tables and read their data using structure tags.
  21. public class ReadTagsTableData
  22. {
  23. private TextFormat _tf, _tfHdr, _tfPgHdr;
  24. private float _margin = 72;
  25.  
  /// <summary>
  /// Builds the demo PDF: initializes the text formats, loads the sample source PDF,
  /// prints the data of the tables found in it via <see cref="PrintAllTables"/>,
  /// and saves the result to <paramref name="stream"/>.
  /// </summary>
  /// <param name="stream">The stream that the resulting PDF is saved to.</param>
  /// <returns>The number of pages in the resulting PDF.</returns>
  public int CreatePDF(Stream stream)
  {
      // Base text format; the other two formats are derived from it.
      var baseFormat = new TextFormat()
      {
          Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeui.ttf")),
          FontSize = 9,
          ForeColor = Color.Black
      };
      _tf = baseFormat;

      // Format used for the caption printed above each table's data.
      var captionFormat = new TextFormat(baseFormat)
      {
          Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeuib.ttf")),
          FontSize = 11,
          ForeColor = Color.DarkBlue
      };
      _tfHdr = captionFormat;

      // Format used for the header drawn over each reference page.
      var pageHeaderFormat = new TextFormat(baseFormat)
      {
          FontSize = 11,
          ForeColor = Color.Gray
      };
      _tfPgHdr = pageHeaderFormat;

      // The document that receives the extracted table data:
      var result = new GcPdfDocument();

      // The source file stays open while its content is being read and drawn:
      string sourcePath = Path.Combine("Resources", "PDFs", "C1Olap-QuickStart.pdf");
      using (var input = File.OpenRead(sourcePath))
      {
          var tagged = new GcPdfDocument();
          tagged.Load(input);
          PrintAllTables(result, tagged);
      }

      // Write out the result and report its page count:
      result.Save(stream);
      return result.Pages.Count;
  }
  59.  
  /// <summary>
  /// Finds the tables that are direct children of the first top-level structure element
  /// of <paramref name="source"/> and prints them into <paramref name="doc"/>. For each
  /// source page that has tables, two pages are added: one with the extracted table data,
  /// and one showing the original page for reference.
  /// </summary>
  /// <param name="doc">The document that receives the output pages.</param>
  /// <param name="source">The tagged document to read the tables from.</param>
  private void PrintAllTables(GcPdfDocument doc, GcPdfDocument source)
  {
      LogicalStructure structure = source.GetLogicalStructure();
      bool hasTags = structure != null && structure.Elements != null && structure.Elements.Count != 0;
      if (!hasTags)
      {
          // Nothing to extract - leave a note on a new page and stop:
          Common.Util.AddNote("No structure tags were found in the source document.", doc.Pages.Add());
          return;
      }

      // The first top-level element is treated as the root:
      Element root = structure.Elements[0];

      // Collect the printed form of each 'Table' element found directly under the root:
      var extracted = new List<(TextLayout, Page)>();
      foreach (var candidate in root.Children)
      {
          if (candidate.StructElement.Type == "Table")
          {
              extracted.Add(PrintTable(candidate));
          }
      }

      // Handle the tables one source page at a time:
      foreach (var pageGroup in extracted.GroupBy(item => item.Item2.Index))
      {
          // First the page for the extracted data, then the page for the reference copy:
          var tablesPage = doc.NewPage();
          var referencePage = doc.NewPage();

          // Draw the source page, with a header identifying it:
          Page original = pageGroup.First().Item2;
          original.Draw(referencePage.Graphics, referencePage.Bounds);
          referencePage.Graphics.DrawString(
              $"Page {original.Index + 1} of the source PDF",
              _tfPgHdr,
              new RectangleF(0, 0, referencePage.Size.Width, _margin),
              TextAlignment.Center,
              ParagraphAlignment.Center,
              false);

          // Stack the tables vertically. This assumes that all tables
          // of one source page fit on a single output page:
          float available = tablesPage.Size.Height - _margin * 2;
          float top = _margin;
          foreach (var entry in pageGroup)
          {
              TextLayout layout = entry.Item1;
              layout.MaxHeight = available;
              layout.MaxWidth = tablesPage.Size.Width - _margin * 2;
              tablesPage.Graphics.DrawTextLayout(layout, new PointF(_margin, top));

              // Move down by the height just used plus a margin-sized gap:
              float used = layout.ContentHeight + _margin;
              available -= used;
              top += used;
          }
      }
  }
  105.  
  /// <summary>
  /// Reads the rows and cells of a 'Table' structure element and renders them as text:
  /// a caption with the page number and table dimensions, followed by one line per row
  /// with the cells of the row separated by tab characters.
  /// </summary>
  /// <param name="e">The element to read; its type must be 'Table'.</param>
  /// <returns>
  /// The text layout holding the table data, and the source page that the table was found on.
  /// </returns>
  /// <exception cref="ArgumentException"><paramref name="e"/> is not a 'Table' element.</exception>
  /// <exception cref="InvalidOperationException">No page could be found for the table.</exception>
  private (TextLayout, Page) PrintTable(Element e)
  {
      if (e.Type != "Table")
      {
          throw new ArgumentException($"Unexpected: element type must be 'Table' but it is '{e.Type}'.", nameof(e));
      }

      // Rows of the table; each row is a list of cells, each cell is a list of paragraphs:
      var table = new List<List<IList<ITextParagraph>>>();
      int maxCols = 0;

      // Walks the element tree and collects all 'TR' (table row) elements. Elements of
      // other types are only searched for nested rows; elements without children are skipped.
      void SelectRows(IReadOnlyList<Element> elements)
      {
          foreach (Element ec in elements)
          {
              if (!ec.HasChildren)
              {
                  continue;
              }
              if (ec.StructElement.Type == "TR")
              {
                  // Only 'TD' children of a row are treated as cells:
                  var tableCells = new List<IList<ITextParagraph>>();
                  foreach (var cell in ec.Children)
                  {
                      if (cell.StructElement.Type == "TD")
                      {
                          tableCells.Add(cell.GetParagraphs());
                      }
                  }
                  maxCols = Math.Max(maxCols, tableCells.Count);
                  table.Add(tableCells);
              }
              else
              {
                  SelectRows(ec.Children);
              }
          }
      }
      SelectRows(e.Children);

      // The page is needed both for the caption and for grouping the results by page:
      var sourcePage = FindPage(e.StructElement);
      if (sourcePage == null)
      {
          throw new InvalidOperationException("Unexpected: could not find the default page for the table.");
      }

      var tl = new TextLayout(72);

      // Caption:
      tl.Append($"\nTable on page {sourcePage.Index + 1} of the source document has {maxCols} column(s) and {table.Count} row(s).\nData by row:", _tfHdr);
      tl.AppendParagraphBreak();

      // Table data, one line per row:
      foreach (var row in table)
      {
          for (int icol = 0; icol < row.Count; ++icol)
          {
              foreach (var para in row[icol])
              {
                  tl.Append(para.GetText());
              }
              // Tabs go between cells only. The column index is used rather than
              // row.IndexOf(cell): IndexOf is always less than row.Count, so a test
              // based on it cannot tell the last cell from the others.
              if (icol < row.Count - 1)
              {
                  tl.Append("\t");
              }
          }
          // End of the row:
          tl.AppendLine();
      }
      return (tl, sourcePage);
  }
  167.  
  /// <summary>
  /// Searches a structure element and its descendants, depth first, for the first
  /// element that has a default page.
  /// </summary>
  /// <param name="se">The structure element to start the search from.</param>
  /// <returns>The first default page found, or null if there is none.</returns>
  private Page FindPage(StructElement se)
  {
      // The element's own page wins if it has one:
      var ownPage = se.DefaultPage;
      if (ownPage != null)
      {
          return ownPage;
      }

      // A leaf without a page has nothing more to offer:
      if (!se.HasChildren)
      {
          return null;
      }

      // Otherwise take the first page found in any of the child subtrees:
      foreach (var kid in se.Children)
      {
          var found = FindPage(kid);
          if (found != null)
          {
              return found;
          }
      }
      return null;
  }
  181. }
  182. }
  183.