ExtractParagraphs.vb
- ''
- '' This code is part of Document Solutions for PDF demos.
- '' Copyright (c) MESCIUS inc. All rights reserved.
- ''
- Imports System.IO
- Imports System.Drawing
- Imports System.Numerics
- Imports System.Collections.Generic
- Imports System.Linq
- Imports GrapeCity.Documents.Text
- Imports GrapeCity.Documents.Drawing
- Imports GrapeCity.Documents.Pdf
- Imports GrapeCity.Documents.Pdf.Annotations
- Imports GrapeCity.Documents.Pdf.Graphics
- Imports GCTEXT = GrapeCity.Documents.Text
- Imports GCDRAW = GrapeCity.Documents.Drawing
-
- '' This sample demonstrates how to extract paragraphs of text from an existing PDF.
- '' It loads an arbitrary PDF into a temporary GcPdfDocument, then
- '' retrieves the paragraphs found on each page of that document using Page.GetTextMap().Paragraphs,
- '' adds the text of each paragraph to a TextLayout (alternating the background color
- '' between paragraphs) and renders it into the current document.
- '' If only the plain text is needed, the alternatives are the method Page.GetText(),
- '' or the method GcPdfDocument.GetText() which retrieves the text from the whole document at once.
Public Class ExtractParagraphs
    '' Generates a PDF that lists the paragraphs found on every page of
    '' Resources/PDFs/Wetlands.pdf, alternating two background colors so that
    '' adjacent paragraphs can be told apart, then appends the original PDF.
    ''
    '' stream: the stream the generated document is saved to.
    '' Returns the number of pages in the generated document.
    Function CreatePDF(ByVal stream As Stream) As Integer
        Const pageMargin = 36
        '' The two background colors that paragraphs alternate between:
        Dim firstBack = Color.PaleGreen
        Dim secondBack = Color.PaleGoldenrod

        Dim doc = New GcPdfDocument()
        Dim firstPage = doc.NewPage()

        '' Explanatory note at the top of the first page (zero height is passed
        '' for the note rectangle; the returned rectangle is used for positioning below):
        Dim noteText =
            "Here we load an existing PDF (Wetlands) into a temporary GcPdfDocument, " +
            "and iterate over the pages of that document, printing all paragraphs found on the page. " +
            "We alternate the background color for the paragraphs so that the bounds between paragraphs are more clear. " +
            "The original PDF is appended to the generated document for reference."
        Dim noteArea = New RectangleF(pageMargin, pageMargin, firstPage.Size.Width - pageMargin * 2, 0)
        Dim noteRect = Util.AddNote(noteText, firstPage, noteArea)

        '' Format used for the per-page captions:
        Dim captionFormat = New TextFormat()
        captionFormat.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf"))
        captionFormat.FontSize = 14
        captionFormat.ForeColor = Color.Blue

        '' Format used for the extracted paragraphs (its BackColor is flipped after each paragraph):
        Dim paragraphFormat = New TextFormat()
        paragraphFormat.Font = StandardFonts.Times
        paragraphFormat.FontSize = 12
        paragraphFormat.BackColor = firstBack

        '' The layout that accumulates all the text. MarginTop is assigned after
        '' MarginAll so that the first page starts below the note:
        Dim layout = firstPage.Graphics.CreateTextLayout()
        With layout
            .MaxWidth = doc.PageSize.Width
            .MaxHeight = doc.PageSize.Height
            .MarginAll = noteRect.Left
            .MarginTop = noteRect.Bottom + 36
        End With

        '' Split options providing widow/orphan control; continuation pages
        '' use the regular margin at the top:
        Dim splitOptions = New TextSplitOptions(layout)
        splitOptions.MinLinesInFirstParagraph = 2
        splitOptions.MinLinesInLastParagraph = 2
        splitOptions.RestMarginTop = noteRect.Left

        '' Load the source PDF into a temporary document and collect its paragraphs:
        Using input = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"))
            Dim source = New GcPdfDocument()
            source.Load(input)

            For pageNo = 1 To source.Pages.Count
                layout.AppendLine(String.Format("Paragraphs from page {0} of the original PDF:", pageNo), captionFormat)

                Dim sourcePage = source.Pages(pageNo - 1)
                For Each paragraph In sourcePage.GetTextMap().Paragraphs
                    layout.AppendLine(paragraph.GetText(), paragraphFormat)
                    '' Switch to the other background color for the next paragraph:
                    Dim nextBack = firstBack
                    If paragraphFormat.BackColor = firstBack Then
                        nextBack = secondBack
                    End If
                    paragraphFormat.BackColor = nextBack
                Next
            Next

            '' Render the text, starting a new page for whatever did not fit on the current one:
            layout.PerformLayout(True)
            Do
                '' Receives the part of the text that did not fit:
                Dim overflow As TextLayout = Nothing
                Dim outcome = layout.Split(splitOptions, overflow)
                doc.Pages.Last.Graphics.DrawTextLayout(layout, PointF.Empty)
                If outcome <> SplitResult.Split Then
                    Exit Do
                End If
                layout = overflow
                doc.NewPage()
            Loop

            '' Append the source document for reference:
            doc.MergeWithDocument(source, New MergeDocumentOptions())

            '' Saving happens inside the Using block, as in the original sample.
            '' NOTE(review): presumably the merged pages still need the source stream open - confirm.
            doc.Save(stream)
            Return doc.Pages.Count
        End Using
    End Function
End Class
-