'' ExtractParagraphs.vb
''
'' This code is part of Document Solutions for PDF demos.
'' Copyright (c) MESCIUS inc. All rights reserved.
''
Imports System.IO
Imports System.Drawing
Imports System.Numerics
Imports System.Collections.Generic
Imports System.Linq
Imports GrapeCity.Documents.Text
Imports GrapeCity.Documents.Drawing
Imports GrapeCity.Documents.Pdf
Imports GrapeCity.Documents.Pdf.Annotations
Imports GrapeCity.Documents.Pdf.Graphics
Imports GCTEXT = GrapeCity.Documents.Text
Imports GCDRAW = GrapeCity.Documents.Drawing
 
'' This sample demonstrates how to extract paragraphs of text from an existing PDF.
'' It loads an existing PDF into a temporary GcPdfDocument, then
'' retrieves the paragraphs found on each page of that document using the page's
'' text map (Page.GetTextMap().Paragraphs), adds all those paragraphs to a TextLayout
'' with alternating background colors, and renders it into the current document.
'' The original PDF is appended to the generated document for reference.
'' To get plain text rather than paragraphs, Page.GetText() retrieves the text of a page,
'' and GcPdfDocument.GetText() retrieves the text from the whole document at once.
Public Class ExtractParagraphs
    '' Generates a PDF that lists the paragraphs found on each page of
    '' Resources/PDFs/Wetlands.pdf, followed by the original document itself,
    '' and saves the result into 'stream'.
    '' Returns the number of pages in the generated document.
    Function CreatePDF(ByVal stream As Stream) As Integer
        Const pageMargin = 36
        '' Paragraph backgrounds alternate between these two colors:
        Dim firstBack = Color.PaleGreen
        Dim secondBack = Color.PaleGoldenrod

        Dim doc = New GcPdfDocument()
        Dim firstPage = doc.NewPage()

        '' Explanatory note at the top of the first page:
        Dim noteText =
            "Here we load an existing PDF (Wetlands) into a temporary GcPdfDocument, " +
            "and iterate over the pages of that document, printing all paragraphs found on the page. " +
            "We alternate the background color for the paragraphs so that the bounds between paragraphs are more clear. " +
            "The original PDF is appended to the generated document for reference."
        Dim noteBounds = New RectangleF(pageMargin, pageMargin, firstPage.Size.Width - pageMargin * 2, 0)
        Dim noteRect = Util.AddNote(noteText, firstPage, noteBounds)

        '' Format used for the per-page captions:
        Dim captionFormat = New TextFormat()
        captionFormat.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "NotoSerif-Regular.ttf"))
        captionFormat.FontSize = 14
        captionFormat.ForeColor = Color.Blue

        '' Format used for the extracted paragraphs; its BackColor is flipped
        '' after every paragraph is appended:
        Dim paragraphFormat = New TextFormat()
        paragraphFormat.Font = StandardFonts.Times
        paragraphFormat.FontSize = 12
        paragraphFormat.BackColor = firstBack

        '' The layout that accumulates all text. It is sized to the whole page,
        '' uses the note's left offset as margin on all sides, and starts
        '' 36 units below the bottom of the note:
        Dim layout = firstPage.Graphics.CreateTextLayout()
        With layout
            .MaxWidth = doc.PageSize.Width
            .MaxHeight = doc.PageSize.Height
            .MarginAll = noteRect.Left
            .MarginTop = noteRect.Bottom + 36
        End With

        '' Split options: require at least two lines of a paragraph on either
        '' side of a split, and set RestMarginTop to the note's left offset:
        Dim splitOptions = New TextSplitOptions(layout)
        splitOptions.MinLinesInFirstParagraph = 2
        splitOptions.MinLinesInLastParagraph = 2
        splitOptions.RestMarginTop = noteRect.Left

        '' Load the source PDF into a temporary document; the file stream stays
        '' open until the generated document has been saved:
        Using source = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"))
            Dim original = New GcPdfDocument()
            original.Load(source)

            '' Append a caption and then all paragraphs for each source page:
            Dim pageNumber = 0
            For Each sourcePage In original.Pages
                pageNumber += 1
                layout.AppendLine(String.Format("Paragraphs from page {0} of the original PDF:", pageNumber), captionFormat)

                For Each paragraph In sourcePage.GetTextMap().Paragraphs
                    layout.AppendLine(paragraph.GetText(), paragraphFormat)
                    '' Switch to the other background for the next paragraph:
                    paragraphFormat.BackColor = If(paragraphFormat.BackColor = firstBack, secondBack, firstBack)
                Next
            Next

            '' Lay out the text, then draw it page by page: each pass draws what
            '' fits on the last page and, if text remains, continues with the
            '' remainder on a newly added page.
            layout.PerformLayout(True)
            Dim outcome As SplitResult
            Do
                '' Receives the text that did not fit:
                Dim overflow As TextLayout = Nothing
                outcome = layout.Split(splitOptions, overflow)
                doc.Pages.Last.Graphics.DrawTextLayout(layout, PointF.Empty)
                If outcome = SplitResult.Split Then
                    layout = overflow
                    doc.NewPage()
                End If
            Loop While outcome = SplitResult.Split

            '' Append the original document for reference:
            doc.MergeWithDocument(original, New MergeDocumentOptions())

            '' Save and report the page count:
            doc.Save(stream)
            Return doc.Pages.Count
        End Using
    End Function
End Class