ReadTagsTableData.cs
- //
- // This code is part of Document Solutions for PDF demos.
- // Copyright (c) MESCIUS inc. All rights reserved.
- //
- using System;
- using System.IO;
- using System.Drawing;
- using System.Linq;
- using System.Collections.Generic;
- using GrapeCity.Documents.Pdf;
- using GrapeCity.Documents.Text;
- using GrapeCity.Documents.Pdf.TextMap;
- using GrapeCity.Documents.Pdf.Structure;
- using GrapeCity.Documents.Pdf.Recognition.Structure;
- using GCTEXT = GrapeCity.Documents.Text;
- using GCDRAW = GrapeCity.Documents.Drawing;
-
- namespace DsPdfWeb.Demos
- {
/// <summary>
/// Finds tables in a tagged PDF and reads their data using structure tags.
/// The extracted data is printed into a new PDF, each page of extracted data
/// being followed by a rendering of the source page it was found on.
/// </summary>
public class ReadTagsTableData
{
    // Text formats: _tf is the base passed to the constructors of the other two,
    // _tfHdr is used for the table summary line, _tfPgHdr for the page headers.
    private TextFormat _tf, _tfHdr, _tfPgHdr;
    // Margin used when placing content on the generated pages.
    private readonly float _margin = 72;

    /// <summary>
    /// Loads the sample PDF, extracts its tables and saves the resulting PDF.
    /// </summary>
    /// <param name="stream">The stream to which the resulting PDF is saved.</param>
    /// <returns>The number of pages in the resulting PDF.</returns>
    public int CreatePDF(Stream stream)
    {
        // Set up some text formats:
        _tf = new TextFormat()
        {
            Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeui.ttf")),
            FontSize = 9,
            ForeColor = Color.Black
        };
        _tfHdr = new TextFormat(_tf)
        {
            Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeuib.ttf")),
            FontSize = 11,
            ForeColor = Color.DarkBlue
        };
        _tfPgHdr = new TextFormat(_tf)
        {
            FontSize = 11,
            ForeColor = Color.Gray
        };

        // The resulting PDF:
        var doc = new GcPdfDocument();
        // The source stream is kept open until all tables have been processed:
        using (var s = File.OpenRead(Path.Combine("Resources", "PDFs", "C1Olap-QuickStart.pdf")))
        {
            var source = new GcPdfDocument();
            source.Load(s);
            PrintAllTables(doc, source);
        }
        // Save the PDF:
        doc.Save(stream);
        return doc.Pages.Count;
    }

    /// <summary>
    /// Finds all "Table" elements among the children of the first top-level
    /// structure element of <paramref name="source"/> and prints their data
    /// into <paramref name="doc"/>, grouped by source page.
    /// </summary>
    /// <param name="doc">The document that receives the extracted data.</param>
    /// <param name="source">The tagged document to read tables from.</param>
    private void PrintAllTables(GcPdfDocument doc, GcPdfDocument source)
    {
        // Get the LogicalStructure and top parent element:
        LogicalStructure ls = source.GetLogicalStructure();
        if (ls == null || ls.Elements == null || ls.Elements.Count == 0)
        {
            // No structure tags found:
            Common.Util.AddNote("No structure tags were found in the source document.", doc.Pages.Add());
            return;
        }
        // The root element:
        Element root = ls.Elements[0];

        // Find and extract all tables. The list is materialized up front so that
        // any extraction error surfaces before pages are added to the result:
        List<(TextLayout, Page)> tables = root.Children
            .Where(e_ => e_.StructElement.Type == "Table")
            .Select(t_ => PrintTable(t_))
            .ToList();
        if (tables.Count == 0)
        {
            // Without this note the resulting document would have no pages at all:
            Common.Util.AddNote("No tables were found among the top-level structure elements of the source document.", doc.Pages.Add());
            return;
        }

        // For each source page, print all tables found on that page,
        // followed by the original page for reference:
        foreach (var tbp in tables.GroupBy(t_ => t_.Item2.Index))
        {
            // All tables in the group share the same source page:
            Page srcPage = tbp.First().Item2;
            // The page that will contain the extracted table data:
            var pgTables = doc.NewPage();
            // The page that will contain the source page for reference:
            var pgSrc = doc.NewPage();
            // Print the original page:
            srcPage.Draw(pgSrc.Graphics, pgSrc.Bounds);
            // Add a page header:
            pgSrc.Graphics.DrawString($"Page {srcPage.Index + 1} of the source PDF",
                _tfPgHdr, new RectangleF(0, 0, pgSrc.Size.Width, _margin), TextAlignment.Center, ParagraphAlignment.Center, false);

            float maxWidth = pgTables.Size.Width - _margin * 2;
            float maxHeight = pgTables.Size.Height - _margin * 2;
            float y = _margin;
            // Print all table data. For simplicity's sake we assume that all table data will fit on a single page:
            foreach (var t in tbp)
            {
                TextLayout tl = t.Item1;
                tl.MaxHeight = maxHeight;
                tl.MaxWidth = maxWidth;
                pgTables.Graphics.DrawTextLayout(tl, new PointF(_margin, y));
                // Move down past this table, leaving a margin-sized gap before the next one:
                float used = tl.ContentHeight + _margin;
                maxHeight -= used;
                y += used;
            }
        }
    }

    /// <summary>
    /// Reads the rows and cells of a table element and formats them as text.
    /// </summary>
    /// <param name="e">A structure element of type "Table".</param>
    /// <returns>
    /// The text layout holding the table data (one line per row, cells separated by tabs),
    /// and the source page returned by <see cref="FindPage"/> for the table.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="e"/> is not a "Table" element, or no page could be found for it.
    /// </exception>
    private (TextLayout, Page) PrintTable(Element e)
    {
        if (e.Type != "Table")
        {
            throw new InvalidOperationException($"Unexpected: element type must be 'Table' but it is '{e.Type}'.");
        }

        // Rows of the table; each row is a list of cells, each cell a list of paragraphs:
        var table = new List<List<IList<ITextParagraph>>>();
        int maxCols = 0;

        // Collect all descendant elements with type TR (table rows),
        // descending through any intermediate elements that have children:
        void SelectRows(IReadOnlyList<Element> elements)
        {
            foreach (Element ec in elements)
            {
                if (!ec.HasChildren)
                {
                    continue;
                }
                if (ec.StructElement.Type != "TR")
                {
                    SelectRows(ec.Children);
                    continue;
                }
                // NOTE(review): only "TD" children are collected, so header cells
                // tagged "TH" are not part of the output - confirm this is intended.
                var cells = ec.Children.Where(c_ => c_.StructElement.Type == "TD").ToList();
                maxCols = Math.Max(maxCols, cells.Count);
                var tableCells = new List<IList<ITextParagraph>>(cells.Count);
                foreach (var cell in cells)
                {
                    tableCells.Add(cell.GetParagraphs());
                }
                table.Add(tableCells);
            }
        }
        SelectRows(e.Children);

        // The page to show next to the table data:
        var sourcePage = FindPage(e.StructElement);
        if (sourcePage == null)
        {
            throw new InvalidOperationException("Unexpected: could not find the default page for the table.");
        }

        var tl = new TextLayout(72);

        // Add table data to the text layout:
        tl.Append($"\nTable on page {sourcePage.Index + 1} of the source document has {maxCols} column(s) and {table.Count} row(s).\nData by row:", _tfHdr);
        tl.AppendParagraphBreak();
        // NOTE(review): cell text is appended without an explicit text format,
        // so _tf is only used as the base of the other formats - confirm this is intended.
        foreach (var row in table)
        {
            for (int icol = 0; icol < row.Count; ++icol)
            {
                foreach (var para in row[icol])
                {
                    tl.Append(para.GetText());
                }
                // Tabs go between cells only; the row itself is terminated below.
                if (icol < row.Count - 1)
                {
                    tl.Append("\t");
                }
            }
            tl.AppendLine();
        }
        return (tl, sourcePage);
    }

    /// <summary>
    /// Returns the default page of a structure element or, if it has none,
    /// the first default page found among its descendants (searched depth-first).
    /// </summary>
    /// <param name="se">The structure element to start the search from.</param>
    /// <returns>The page found, or null if neither the element nor any descendant has a default page.</returns>
    private Page FindPage(StructElement se)
    {
        if (se.DefaultPage != null)
        {
            return se.DefaultPage;
        }
        if (se.HasChildren)
        {
            foreach (var child in se.Children)
            {
                var p = FindPage(child);
                if (p != null)
                {
                    return p;
                }
            }
        }
        return null;
    }
}
- }
-