GetTablePages.cs
- //
- // This code is part of Document Solutions for PDF demos.
- // Copyright (c) MESCIUS inc. All rights reserved.
- //
- using System;
- using System.IO;
- using System.Drawing;
- using System.Linq;
- using System.Collections.Generic;
- using GrapeCity.Documents.Pdf;
- using GrapeCity.Documents.Pdf.Recognition;
- using GrapeCity.Documents.Text;
- using GrapeCity.Documents.Common;
- using GCTEXT = GrapeCity.Documents.Text;
- using GCDRAW = GrapeCity.Documents.Drawing;
-
- namespace DsPdfWeb.Demos
- {
/// <summary>
/// Demo: loads a product price list PDF whose table is split across several pages,
/// extracts the table found on each page with <c>Page.GetTable()</c>, prints the
/// extracted rows and cells into a new PDF, and appends the source PDF for reference.
/// </summary>
public class GetTablePages
{
    // Scale factor used for all table-bounds math below (72 units per "inch" multiplier).
    private const float Dpi = 72;

    // Margin applied to the note rectangle and to the text layout on generated pages.
    private const float Margin = 36;

    /// <summary>
    /// Generates the demo PDF and writes it to <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">The stream the generated document is saved to.</param>
    /// <returns>The number of pages in the generated document (extracted data plus the appended source).</returns>
    public int CreatePDF(Stream stream)
    {
        var doc = new GcPdfDocument();

        // Regular, header and "problem" (red) text formats used when printing the table data:
        var tf = new TextFormat()
        {
            Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeui.ttf")),
            FontSize = 9,
            ForeColor = Color.Black
        };
        var tfHdr = new TextFormat(tf)
        {
            Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeuib.ttf")),
            FontSize = 11,
            ForeColor = Color.DarkBlue
        };
        var tfRed = new TextFormat(tf) { ForeColor = Color.Red };

        // The source stream must stay open until the merged document has been saved,
        // so everything up to doc.Save() lives inside this using block.
        using (var fs = File.OpenRead(Path.Combine("Resources", "PDFs", "product-list.pdf")))
        {
            var page = doc.NewPage();
            page.Landscape = true;
            var g = page.Graphics;

            var rc = Common.Util.AddNote(
                "This sample loads a PDF that contains a table split between several pages (a product price list), " +
                "and extracts the tables on all pages using the Page.GetTable() method. " +
                "The extracted data is printed as a list of rows and cells. " +
                "The source product list PDF is appended to the generated document for reference.",
                page,
                new RectangleF(Margin, Margin, page.Bounds.Width - Margin * 2, page.Bounds.Height - Margin * 2));

            // The text layout covers the whole page; its top margin starts below the note.
            var tl = g.CreateTextLayout();
            tl.MaxWidth = page.Bounds.Width;
            tl.MaxHeight = page.Bounds.Height;
            tl.MarginAll = Margin;
            tl.MarginTop = rc.Bottom;
            tl.DefaultTabStops = 165;

            var docSrc = new GcPdfDocument();
            docSrc.Load(fs);

            for (int i = 0; i < docSrc.Pages.Count; ++i)
            {
                // The search area starts lower on the first page (Dpi * 2) than on the
                // following pages (Dpi); its bottom edge is at Dpi * 10.5 on every page.
                var top = i == 0 ? Dpi * 2 : Dpi;
                var tableBounds = new RectangleF(Dpi * 0.25f, top, Dpi * 8, Dpi * 10.5f - top);

                // Get the table at the specified bounds:
                var itable = docSrc.Pages[i].GetTable(tableBounds, CreateTableExtractOptions());

                // NOTE(review): GetTable is expected to return null when no table is recognized
                // in the bounds (confirm against the DsPdf API docs). Report that instead of
                // failing with a NullReferenceException on the property accesses below.
                if (itable is null)
                {
                    tl.Append($"\nNo table was found on page {i + 1} of the source document.", tfRed);
                    tl.AppendParagraphBreak();
                    continue;
                }

                // Add table data to the text layout:
                tl.Append($"\nTable on page {i + 1} of the source document has {itable.Cols.Count} column(s) and {itable.Rows.Count} row(s), table data is:", tfHdr);
                tl.AppendParagraphBreak();
                for (int row = 0; row < itable.Rows.Count; ++row)
                {
                    // The first row of each table is printed with the header format:
                    var tfmt = row == 0 ? tfHdr : tf;
                    for (int col = 0; col < itable.Cols.Count; ++col)
                    {
                        var cell = itable.GetCell(row, col);
                        if (col > 0)
                        {
                            tl.Append("\t", tfmt);
                        }
                        if (cell is null)
                        {
                            tl.Append("<no cell>", tfRed);
                        }
                        else
                        {
                            tl.Append(cell.Text, tfmt);
                        }
                    }
                    tl.AppendLine();
                }
            }

            // Print the extracted data, adding landscape pages while text remains:
            var to = new TextSplitOptions(tl) { RestMarginTop = Margin, MinLinesInFirstParagraph = 2, MinLinesInLastParagraph = 2 };
            tl.PerformLayout(true);
            while (true)
            {
                var splitResult = tl.Split(to, out TextLayout rest);
                doc.Pages.Last.Graphics.DrawTextLayout(tl, PointF.Empty);
                if (splitResult != SplitResult.Split)
                {
                    break;
                }
                tl = rest;
                doc.NewPage().Landscape = true;
            }

            // Append the original document for reference:
            doc.MergeWithDocument(docSrc);

            doc.Save(stream);
            return doc.Pages.Count;
        }
    }

    /// <summary>
    /// Creates table extraction options tuned for the product list table.
    /// </summary>
    /// <returns>
    /// Options whose minimum distance between rows is the library default multiplied by 1.2.
    /// </returns>
    private static TableExtractOptions CreateTableExtractOptions()
    {
        // TableExtractOptions allow you to fine-tune table recognition accounting for
        // specifics of the table formatting:
        var options = new TableExtractOptions();
        var defaultMinRowDistance = options.GetMinimumDistanceBetweenRows;
        // In this particular case, we slightly increase the minimum distance between rows
        // to make sure cells with wrapped text are not mistaken for two cells:
        options.GetMinimumDistanceBetweenRows = (list) =>
        {
            var res = defaultMinRowDistance(list);
            return res * 1.2f;
        };
        return options;
    }
}
- }
-