'' ExtractText.vb
''
'' This code is part of Document Solutions for PDF demos.
'' Copyright (c) MESCIUS inc. All rights reserved.
''
Imports System.IO
Imports System.Drawing
Imports GrapeCity.Documents.Text
Imports GrapeCity.Documents.Pdf
Imports GCTEXT = GrapeCity.Documents.Text
Imports GCDRAW = GrapeCity.Documents.Drawing
 
'' This sample demonstrates how to extract text from an existing PDF.
'' It loads an arbitrary PDF into a temporary GcPdfDocument, then
'' retrieves text from each page of that document using the Page.GetText() method,
'' adds all those texts to a TextLayout and renders it into the current document.
'' An alternative to Page.GetText() is the method GcPdfDocument.GetText()
'' which retrieves the text from the whole document at once.
Public Class ExtractText
    '' Builds the sample PDF and saves it to 'stream'.
    ''
    '' The first page gets an explanatory note (via Util.AddNote). Below it,
    '' the text of every page of Resources/PDFs/Wetlands.pdf is rendered,
    '' each page's text preceded by a blue caption. Whenever the text does
    '' not fit on the current page, a new page is added and rendering continues.
    ''
    '' Returns the number of pages in the generated document.
    Function CreatePDF(ByVal stream As Stream) As Integer
        Dim doc = New GcPdfDocument()
        Dim firstPage = doc.NewPage()

        '' Explanatory note at the top of the first page.
        '' NOTE(review): noteBounds is presumably the rectangle occupied by the note;
        '' only its Left and Bottom are used below - confirm against Util.AddNote.
        Dim noteText = "This sample loads an arbitrary PDF into a temporary GcPdfDocument, " &
            "then retrieves text from each page of the loaded document using the Page.GetText() method, " &
            "adds all those texts to a TextLayout and renders it into the current document. " &
            "An alternative to Page.GetText() is the method GcPdfDocument.GetText() " &
            "which retrieves the text from the whole document at once."
        Dim noteBounds = Util.AddNote(noteText, firstPage)

        '' Format used for the per-page captions:
        Dim captionFormat = New TextFormat()
        captionFormat.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "NotoSerif-Regular.ttf"))
        captionFormat.FontSize = 14
        captionFormat.ForeColor = Color.Blue

        '' Layout that accumulates all captions and extracted texts:
        Dim layout = New TextLayout(72)
        With layout
            .DefaultFormat.Font = StandardFonts.Times
            .DefaultFormat.FontSize = 12
            .MaxWidth = doc.PageSize.Width
            .MaxHeight = doc.PageSize.Height
            '' MarginAll must be assigned before MarginTop, otherwise it would
            '' overwrite the larger top margin that leaves room for the note.
            .MarginAll = noteBounds.Left
            .MarginTop = noteBounds.Bottom + 36
        End With

        '' Split options (widow/orphan control). RestMarginTop gets the same
        '' value as the side margins set above.
        Dim splitOpts = New TextSplitOptions(layout)
        splitOpts.MinLinesInFirstParagraph = 2
        splitOpts.MinLinesInLastParagraph = 2
        splitOpts.RestMarginTop = noteBounds.Left

        '' Load the source PDF into a temporary document; the file stream stays
        '' open for as long as the loaded document is being read and rendered.
        Using pdfFile As New FileStream(Path.Combine("Resources", "PDFs", "Wetlands.pdf"), FileMode.Open, FileAccess.Read)
            Dim srcDoc = New GcPdfDocument()
            srcDoc.Load(pdfFile)

            '' Collect the text of each page of the loaded document:
            Dim pageTexts = New List(Of String)()
            For Each srcPage In srcDoc.Pages.ToList()
                pageTexts.Add(srcPage.GetText())
            Next

            '' Append a caption (1-based page number) followed by the page's text:
            Dim pageNo = 0
            For Each pageText In pageTexts
                pageNo += 1
                layout.AppendLine(String.Format("Text from page {0} of the loaded document:", pageNo), captionFormat)
                layout.AppendLine(pageText)
            Next
            layout.PerformLayout(True)

            '' Draw the layout page by page until nothing is left over:
            Do
                '' 'overflow' receives whatever did not fit into the current layout:
                Dim overflow As TextLayout = Nothing
                Dim outcome = layout.Split(splitOpts, overflow)
                doc.Pages.Last.Graphics.DrawTextLayout(layout, PointF.Empty)
                If outcome <> SplitResult.Split Then
                    Exit Do
                End If
                '' Continue with the remainder on a fresh page:
                layout = overflow
                doc.NewPage()
            Loop
        End Using

        '' Done:
        doc.Save(stream)
        Return doc.Pages.Count
    End Function
End Class