//
// This code is part of Document Solutions for PDF demos.
// Copyright (c) MESCIUS inc. All rights reserved.
//
using System;
using System.IO;
using System.Drawing;
using System.Numerics;
using System.Collections.Generic;
using System.Linq;
using GrapeCity.Documents.Text;
using GrapeCity.Documents.Drawing;
using GrapeCity.Documents.Pdf;
using GrapeCity.Documents.Pdf.Annotations;
using GrapeCity.Documents.Pdf.Graphics;
using GCTEXT = GrapeCity.Documents.Text;
using GCDRAW = GrapeCity.Documents.Drawing;
namespace DsPdfWeb.Demos
{
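// This sample demonstrates how to extract text paragraphs from an existing PDF.
// It loads an existing PDF into a temporary GcPdfDocument, then
// retrieves the paragraphs found on each page of that document using Page.GetTextMap().Paragraphs,
// adds the text of each paragraph to a TextLayout (alternating the background color
// so that the bounds between paragraphs are visible) and renders it into the current document.
// The original PDF is then appended to the generated document for reference.
// If plain text rather than individual paragraphs is needed, alternatives are
// the Page.GetText() method, or the GcPdfDocument.GetText() method
// which retrieves the text from the whole document at once.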
public class ExtractParagraphs
{
    // Generates the sample PDF and saves it to 'stream'.
    //
    // The generated document starts with an explanatory note, followed by the text
    // of every paragraph found on each page of Resources/PDFs/Wetlands.pdf
    // (paragraph backgrounds alternate between two colors), followed by the
    // pages of the source PDF itself.
    //
    // Returns the total number of pages in the saved document
    // (this count includes the appended pages of the source PDF).
    public int CreatePDF(Stream stream)
    {
        const int margin = 36;
        // The two background colors that paragraphs alternate between:
        var firstBack = Color.PaleGreen;
        var secondBack = Color.PaleGoldenrod;

        var doc = new GcPdfDocument();
        var firstPage = doc.NewPage();

        // Explanatory note at the top of the first page;
        // the returned rectangle is used below to position the extracted text:
        var noteBounds = new RectangleF(margin, margin, firstPage.Size.Width - margin * 2, 0);
        var noteRect = Common.Util.AddNote(
            "Here we load an existing PDF (Wetlands) into a temporary GcPdfDocument, " +
            "and iterate over the pages of that document, printing all paragraphs found on the page. " +
            "We alternate the background color for the paragraphs so that the bounds between paragraphs are more clear. " +
            "The original PDF is appended to the generated document for reference.",
            firstPage,
            noteBounds);

        // Format used for the "Paragraphs from page N" captions:
        var captionFormat = new TextFormat();
        captionFormat.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf"));
        captionFormat.FontSize = 14;
        captionFormat.ForeColor = Color.Blue;

        // Format used for the extracted paragraphs; its BackColor is flipped
        // after each paragraph is appended:
        var paragraphFormat = new TextFormat();
        paragraphFormat.Font = StandardFonts.Times;
        paragraphFormat.FontSize = 12;
        paragraphFormat.BackColor = firstBack;

        // The layout that accumulates all captions and paragraphs.
        // MarginAll must be assigned before MarginTop, as the top margin
        // is then overridden so that the text starts below the note:
        var layout = firstPage.Graphics.CreateTextLayout();
        layout.MaxWidth = doc.PageSize.Width;
        layout.MaxHeight = doc.PageSize.Height;
        layout.MarginAll = noteRect.Left;
        layout.MarginTop = noteRect.Bottom + 36;

        // Split options providing widow/orphan control.
        // RestMarginTop is set to the same value as the side margins
        // (presumably it is the top margin of the continuation layouts,
        // which do not need to leave room for the note - confirm against the API docs):
        var splitOptions = new TextSplitOptions(layout)
        {
            MinLinesInFirstParagraph = 2,
            MinLinesInLastParagraph = 2,
            RestMarginTop = noteRect.Left,
        };

        // Load the source PDF into a temporary document; the file stream
        // stays open until this method returns:
        using var sourceStream = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"));
        var sourceDoc = new GcPdfDocument();
        sourceDoc.Load(sourceStream);

        // For each source page add a caption, then the text of each paragraph on that page:
        int pageNumber = 0;
        foreach (var sourcePage in sourceDoc.Pages)
        {
            ++pageNumber;
            layout.AppendLine(string.Format("Paragraphs from page {0} of the original PDF:", pageNumber), captionFormat);
            foreach (var paragraph in sourcePage.GetTextMap().Paragraphs)
            {
                layout.AppendLine(paragraph.GetText(), paragraphFormat);
                // Switch to the other background color for the next paragraph:
                if (paragraphFormat.BackColor == firstBack)
                {
                    paragraphFormat.BackColor = secondBack;
                }
                else
                {
                    paragraphFormat.BackColor = firstBack;
                }
            }
        }

        layout.PerformLayout(true);

        // Draw the layout on the last page, and for as long as the split
        // reports that some text did not fit, continue on a new page with
        // the layout holding the remaining text:
        SplitResult outcome;
        do
        {
            outcome = layout.Split(splitOptions, out TextLayout overflow);
            doc.Pages.Last.Graphics.DrawTextLayout(layout, PointF.Empty);
            if (outcome == SplitResult.Split)
            {
                layout = overflow;
                doc.NewPage();
            }
        }
        while (outcome == SplitResult.Split);

        // Append the pages of the source PDF for reference:
        doc.MergeWithDocument(sourceDoc, new MergeDocumentOptions());

        // Save the result and report the page count:
        doc.Save(stream);
        return doc.Pages.Count;
    }
}
}