ExtractText.vb
- ''
- '' This code is part of Document Solutions for PDF demos.
- '' Copyright (c) MESCIUS inc. All rights reserved.
- ''
- Imports System.IO
- Imports System.Drawing
- Imports GrapeCity.Documents.Text
- Imports GrapeCity.Documents.Pdf
- Imports GCTEXT = GrapeCity.Documents.Text
- Imports GCDRAW = GrapeCity.Documents.Drawing
-
- '' This sample demonstrates how to extract text from an existing PDF.
- '' It loads an arbitrary PDF into a temporary GcPdfDocument, then
- '' retrieves text from each page of that document using the Page.GetText() method,
- '' adds all those texts to a TextLayout and renders it into the current document.
- '' An alternative to Page.GetText() is the method GcPdfDocument.GetText()
- '' which retrieves the text from the whole document at once.
Public Class ExtractText
    '' Creates a new PDF that contains the text extracted from every page of
    '' Resources/PDFs/Wetlands.pdf, saves the new PDF to 'stream', and
    '' returns the number of pages in the generated document.
    Function CreatePDF(ByVal stream As Stream) As Integer
        Dim doc = New GcPdfDocument()
        Dim firstPage = doc.NewPage()

        '' Put an explanatory note on the first page. The rectangle returned
        '' by Util.AddNote is used below to derive the text layout margins.
        Dim noteText =
            "This sample loads an arbitrary PDF into a temporary GcPdfDocument, " &
            "then retrieves text from each page of the loaded document using the Page.GetText() method, " &
            "adds all those texts to a TextLayout and renders it into the current document. " &
            "An alternative to Page.GetText() is the method GcPdfDocument.GetText() " &
            "which retrieves the text from the whole document at once."
        Dim noteBounds = Util.AddNote(noteText, firstPage)

        '' Format used for the per-page captions:
        Dim captionFormat = New TextFormat()
        captionFormat.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf"))
        captionFormat.FontSize = 14
        captionFormat.ForeColor = Color.Blue

        '' Layout that accumulates all extracted text. MarginTop is assigned
        '' after MarginAll so that the first page starts below the note.
        Dim layout = New TextLayout(72)
        With layout
            .DefaultFormat.Font = StandardFonts.Times
            .DefaultFormat.FontSize = 12
            .MaxWidth = doc.PageSize.Width
            .MaxHeight = doc.PageSize.Height
            .MarginAll = noteBounds.Left
            .MarginTop = noteBounds.Bottom + 36
        End With

        '' Split options (widow/orphan control and the top margin applied
        '' to the remainder after a split):
        Dim splitOptions = New TextSplitOptions(layout)
        splitOptions.MinLinesInFirstParagraph = 2
        splitOptions.MinLinesInLastParagraph = 2
        splitOptions.RestMarginTop = noteBounds.Left

        '' Load the source PDF into a temporary document; the file stream
        '' stays open until all rendering below has finished.
        Using source As New FileStream(Path.Combine("Resources", "PDFs", "Wetlands.pdf"), FileMode.Open, FileAccess.Read)
            Dim loadedDoc = New GcPdfDocument()
            loadedDoc.Load(source)

            '' Collect the text of every page of the loaded document:
            Dim pageTexts = New List(Of String)()
            For Each loadedPage In loadedDoc.Pages.ToList()
                pageTexts.Add(loadedPage.GetText())
            Next

            '' Append a caption followed by the page text, for each page in order:
            Dim pageNumber = 0
            For Each pageText In pageTexts
                pageNumber += 1
                layout.AppendLine(String.Format("Text from page {0} of the loaded document:", pageNumber), captionFormat)
                layout.AppendLine(pageText)
            Next
            layout.PerformLayout(True)

            '' Render the layout on the last page of the target document;
            '' while a split leaves a remainder, add a page and continue with it.
            Dim hasMore As Boolean
            Do
                '' 'overflow' receives the text that did not fit:
                Dim overflow As TextLayout = Nothing
                Dim outcome = layout.Split(splitOptions, overflow)
                doc.Pages.Last.Graphics.DrawTextLayout(layout, PointF.Empty)
                hasMore = (outcome = SplitResult.Split)
                If hasMore Then
                    layout = overflow
                    doc.NewPage()
                End If
            Loop While hasMore
        End Using

        '' Done:
        doc.Save(stream)
        Return doc.Pages.Count
    End Function
End Class
-