ExtractParagraphs.cs
- //
- // This code is part of Document Solutions for PDF demos.
- // Copyright (c) MESCIUS inc. All rights reserved.
- //
- using System;
- using System.IO;
- using System.Drawing;
- using System.Numerics;
- using System.Collections.Generic;
- using System.Linq;
- using GrapeCity.Documents.Text;
- using GrapeCity.Documents.Drawing;
- using GrapeCity.Documents.Pdf;
- using GrapeCity.Documents.Pdf.Annotations;
- using GrapeCity.Documents.Pdf.Graphics;
- using GCTEXT = GrapeCity.Documents.Text;
- using GCDRAW = GrapeCity.Documents.Drawing;
-
namespace DsPdfWeb.Demos
{
    // This sample demonstrates how to extract the paragraphs of text from an existing PDF.
    // It loads a PDF (Wetlands.pdf) into a temporary GcPdfDocument, then for each page
    // of that document takes the paragraphs exposed by Page.GetTextMap().Paragraphs,
    // appends the text of each paragraph to a TextLayout (alternating the background
    // color from one paragraph to the next), and renders that layout into the current
    // document, splitting it over as many pages as needed.
    // Finally the original PDF is merged into the generated document for reference.
    public class ExtractParagraphs
    {
        // Builds the sample document and saves it to 'stream'.
        // Returns the page count of the generated document, taken after
        // the original PDF has been merged into it.
        public int CreatePDF(Stream stream)
        {
            const int margin = 36;
            // The two background colors that the paragraphs alternate between:
            var firstBack = Color.PaleGreen;
            var secondBack = Color.PaleGoldenrod;

            var doc = new GcPdfDocument();
            var notePage = doc.NewPage();

            // The explanatory note goes at the top of the first page;
            // the returned rectangle is used below to position the text under it:
            var noteRect = Common.Util.AddNote(
                "Here we load an existing PDF (Wetlands) into a temporary GcPdfDocument, " +
                "and iterate over the pages of that document, printing all paragraphs found on the page. " +
                "We alternate the background color for the paragraphs so that the bounds between paragraphs are more clear. " +
                "The original PDF is appended to the generated document for reference.",
                notePage,
                new RectangleF(margin, margin, notePage.Size.Width - margin * 2, 0));

            // Format of the "Paragraphs from page N..." captions:
            var captionFormat = new TextFormat()
            {
                Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf")),
                FontSize = 14,
                ForeColor = Color.Blue
            };
            // Format of the extracted paragraphs (its BackColor is toggled after each paragraph):
            var paragraphFormat = new TextFormat()
            {
                Font = StandardFonts.Times,
                FontSize = 12,
                BackColor = firstBack,
            };

            // The layout that accumulates all captions and paragraphs.
            // MarginAll must be assigned before MarginTop, as the top margin
            // is then overridden so that the text starts below the note:
            var layout = notePage.Graphics.CreateTextLayout();
            layout.MaxWidth = doc.PageSize.Width;
            layout.MaxHeight = doc.PageSize.Height;
            layout.MarginAll = noteRect.Left;
            layout.MarginTop = noteRect.Bottom + 36;

            // Split options with widow/orphan control; RestMarginTop
            // is given the same value as the side margins:
            var splitOptions = new TextSplitOptions(layout)
            {
                MinLinesInFirstParagraph = 2,
                MinLinesInLastParagraph = 2,
                RestMarginTop = noteRect.Left,
            };

            // Load the source PDF into a temporary document. The file stream is
            // disposed only when this method returns, i.e. after doc.Save() below.
            using var sourceStream = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"));
            var sourceDoc = new GcPdfDocument();
            sourceDoc.Load(sourceStream);

            for (int pageIndex = 0; pageIndex < sourceDoc.Pages.Count; pageIndex++)
            {
                // Caption with the 1-based number of the source page:
                int pageNumber = pageIndex + 1;
                layout.AppendLine(string.Format("Paragraphs from page {0} of the original PDF:", pageNumber), captionFormat);

                foreach (var paragraph in sourceDoc.Pages[pageIndex].GetTextMap().Paragraphs)
                {
                    layout.AppendLine(paragraph.GetText(), paragraphFormat);
                    // Switch to the other background color for the next paragraph.
                    // NOTE(review): this relies on AppendLine capturing the format's
                    // current state rather than keeping a live reference - confirm.
                    if (paragraphFormat.BackColor == firstBack)
                    {
                        paragraphFormat.BackColor = secondBack;
                    }
                    else
                    {
                        paragraphFormat.BackColor = firstBack;
                    }
                }
            }

            layout.PerformLayout(true);

            // Draw the layout on the last page; while Split() reports that some text
            // did not fit, continue with the remainder on a newly added page:
            SplitResult outcome;
            do
            {
                // 'remainder' receives the text that did not fit:
                outcome = layout.Split(splitOptions, out TextLayout remainder);
                doc.Pages.Last.Graphics.DrawTextLayout(layout, PointF.Empty);
                if (outcome == SplitResult.Split)
                {
                    layout = remainder;
                    doc.NewPage();
                }
            }
            while (outcome == SplitResult.Split);

            // Append the original document for reference:
            doc.MergeWithDocument(sourceDoc, new MergeDocumentOptions());
            // Done:
            doc.Save(stream);
            return doc.Pages.Count;
        }
    }
}
-