''
'' This code is part of Document Solutions for PDF demos.
'' Copyright (c) MESCIUS inc. All rights reserved.
''
Imports System.IO
Imports System.Drawing
Imports System.Numerics
Imports System.Collections.Generic
Imports System.Linq
Imports GrapeCity.Documents.Text
Imports GrapeCity.Documents.Drawing
Imports GrapeCity.Documents.Pdf
Imports GrapeCity.Documents.Pdf.Annotations
Imports GrapeCity.Documents.Pdf.Graphics
Imports GCTEXT = GrapeCity.Documents.Text
Imports GCDRAW = GrapeCity.Documents.Drawing
'' This sample demonstrates how to extract text paragraphs from an existing PDF.
'' It loads an arbitrary PDF into a temporary GcPdfDocument, then
'' retrieves the paragraphs found on each page of that document via the page's
'' text map (Page.GetTextMap().Paragraphs), adds the paragraph texts to a
'' TextLayout and renders it into the current document.
'' To get plain text rather than paragraphs, the Page.GetText() method can be
'' used instead; an alternative to Page.GetText() is the method
'' GcPdfDocument.GetText(), which retrieves the text from the whole document at once.
Public Class ExtractParagraphs
    '' Builds the sample PDF and saves it to the passed stream.
    ''
    '' The generated document starts with an explanatory note, followed by the
    '' text of every paragraph found on each page of Resources/PDFs/Wetlands.pdf
    '' (paragraph backgrounds alternate between two colors), followed by the
    '' original PDF itself, merged in for reference.
    ''
    '' Parameters:
    ''   stream - the stream to which the generated PDF is saved.
    '' Returns:
    ''   the number of pages in the generated document (doc.Pages.Count).
    Function CreatePDF(ByVal stream As Stream) As Integer
        Const margin = 36
        '' The two background colors alternated between consecutive paragraphs:
        Dim c1 = Color.PaleGreen
        Dim c2 = Color.PaleGoldenrod

        Dim doc = New GcPdfDocument()
        Dim page = doc.NewPage()

        Dim rc = Util.AddNote(
            "Here we load an existing PDF (Wetlands) into a temporary GcPdfDocument, " +
            "and iterate over the pages of that document, printing all paragraphs found on the page. " +
            "We alternate the background color for the paragraphs so that the bounds between paragraphs are more clear. " +
            "The original PDF is appended to the generated document for reference.",
            page,
            New RectangleF(margin, margin, page.Size.Width - margin * 2, 0))

        '' Text format for captions:
        Dim tf = New TextFormat() With
        {
            .Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf")),
            .FontSize = 14,
            .ForeColor = Color.Blue
        }
        '' Text format for the paragraphs (the first paragraph uses c1):
        Dim tfpar = New TextFormat() With
        {
            .Font = StandardFonts.Times,
            .FontSize = 12,
            .BackColor = c1
        }

        '' Text layout to render the text. On the first page the top margin
        '' is pushed down so that the text starts below the note rectangle:
        Dim tl = page.Graphics.CreateTextLayout()
        tl.MaxWidth = doc.PageSize.Width
        tl.MaxHeight = doc.PageSize.Height
        tl.MarginAll = rc.Left
        tl.MarginTop = rc.Bottom + 36

        '' Text split options for widow/orphan control. RestMarginTop sets the
        '' top margin used for the continuation layouts (pages without the note):
        Dim topt = New TextSplitOptions(tl) With
        {
            .MinLinesInFirstParagraph = 2,
            .MinLinesInLastParagraph = 2,
            .RestMarginTop = rc.Left
        }

        '' Open an arbitrary PDF, load it into a temp document and get the paragraphs of all pages.
        '' The file stream is kept open until the result has been saved, as the
        '' loaded document is still used by MergeWithDocument() below.
        Using fs = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"))
            Dim doc1 = New GcPdfDocument()
            doc1.Load(fs)

            For i = 0 To doc1.Pages.Count - 1
                tl.AppendLine(String.Format("Paragraphs from page {0} of the original PDF:", i + 1), tf)
                Dim pg = doc1.Pages(i)
                Dim pars = pg.GetTextMap().Paragraphs
                For Each par In pars
                    tl.AppendLine(par.GetText(), tfpar)
                    '' Use a NEW TextFormat instance for the next paragraph instead of
                    '' changing the one that was just passed to the layout: mutating a
                    '' format instance shared by all appended paragraphs could make them
                    '' all render with the same (last assigned) background color.
                    tfpar = New TextFormat(tfpar) With
                    {
                        .BackColor = If(tfpar.BackColor = c1, c2, c1)
                    }
                Next
            Next

            tl.PerformLayout(True)
            While True
                '' 'rest' will accept the text that did not fit:
                Dim rest As TextLayout = Nothing
                Dim splitResult = tl.Split(topt, rest)
                '' Draw the part that fits on the current (last) page:
                doc.Pages.Last.Graphics.DrawTextLayout(tl, PointF.Empty)
                If splitResult <> SplitResult.Split Then
                    Exit While
                End If
                '' Continue with the remaining text on a new page:
                tl = rest
                doc.NewPage()
            End While

            '' Append the original document for reference:
            doc.MergeWithDocument(doc1, New MergeDocumentOptions())
            '' Done:
            doc.Save(stream)
            Return doc.Pages.Count
        End Using
    End Function
End Class