ExtractParagraphs.vb
  1. ''
  2. '' This code is part of Document Solutions for PDF demos.
  3. '' Copyright (c) MESCIUS inc. All rights reserved.
  4. ''
  5. Imports System.IO
  6. Imports System.Drawing
  7. Imports System.Numerics
  8. Imports System.Collections.Generic
  9. Imports System.Linq
  10. Imports GrapeCity.Documents.Text
  11. Imports GrapeCity.Documents.Drawing
  12. Imports GrapeCity.Documents.Pdf
  13. Imports GrapeCity.Documents.Pdf.Annotations
  14. Imports GrapeCity.Documents.Pdf.Graphics
  15. Imports GCTEXT = GrapeCity.Documents.Text
  16. Imports GCDRAW = GrapeCity.Documents.Drawing
  17.  
  18. '' This sample demonstrates how to extract text from an existing PDF.
  19. '' It loads an arbitrary PDF into a temporary GcPdfDocument, then
  20. '' retrieves text from each page of that document using the Page.GetText() method,
  21. '' adds all those texts to a TextLayout And renders it into the current document.
  22. '' An alternative to Page.GetText() Is the method GcPdfDocument.GetText()
  23. '' which retrieves the text from the whole document at once.
  24. Public Class ExtractParagraphs
  25. Function CreatePDF(ByVal stream As Stream) As Integer
  26. Const margin = 36
  27. Dim c1 = Color.PaleGreen
  28. Dim c2 = Color.PaleGoldenrod
  29.  
  30. Dim doc = New GcPdfDocument()
  31. Dim page = doc.NewPage()
  32.  
  33. Dim rc = Util.AddNote(
  34. "Here we load an existing PDF (Wetlands) into a temporary GcPdfDocument, " +
  35. "and iterate over the pages of that document, printing all paragraphs found on the page. " +
  36. "We alternate the background color for the paragraphs so that the bounds between paragraphs are more clear. " +
  37. "The original PDF is appended to the generated document for reference.",
  38. page,
  39. New RectangleF(margin, margin, page.Size.Width - margin * 2, 0))
  40.  
  41. '' Text format for captions:
  42. Dim tf = New TextFormat() With
  43. {
  44. .Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf")),
  45. .FontSize = 14,
  46. .ForeColor = Color.Blue
  47. }
  48. '' Text format for the paragraphs:
  49. Dim tfpar = New TextFormat() With
  50. {
  51. .Font = StandardFonts.Times,
  52. .FontSize = 12,
  53. .BackColor = c1
  54. }
  55. '' Text layout to render the text:
  56. Dim tl = page.Graphics.CreateTextLayout()
  57. tl.MaxWidth = doc.PageSize.Width
  58. tl.MaxHeight = doc.PageSize.Height
  59. tl.MarginAll = rc.Left
  60. tl.MarginTop = rc.Bottom + 36
  61. '' Text split options for widow/orphan control:
  62. Dim topt = New TextSplitOptions(tl) With
  63. {
  64. .MinLinesInFirstParagraph = 2,
  65. .MinLinesInLastParagraph = 2,
  66. .RestMarginTop = rc.Left
  67. }
  68.  
  69. '' Open an arbitrary PDF, load it into a temp document And get all page texts
  70. Using fs = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"))
  71. Dim doc1 = New GcPdfDocument()
  72. doc1.Load(fs)
  73.  
  74. For i = 0 To doc1.Pages.Count - 1
  75. tl.AppendLine(String.Format("Paragraphs from page {0} of the original PDF:", i + 1), tf)
  76.  
  77. Dim pg = doc1.Pages(i)
  78. Dim pars = pg.GetTextMap().Paragraphs
  79. For Each par In pars
  80. tl.AppendLine(par.GetText(), tfpar)
  81. If tfpar.BackColor = c1 Then
  82. tfpar.BackColor = c2
  83. Else
  84. tfpar.BackColor = c1
  85. End If
  86. Next
  87. Next
  88.  
  89. tl.PerformLayout(True)
  90. While True
  91. '' 'rest' will accept the text that did not fit:
  92. Dim rest As TextLayout = Nothing
  93. Dim splitResult = tl.Split(topt, rest)
  94. doc.Pages.Last.Graphics.DrawTextLayout(tl, PointF.Empty)
  95. If splitResult <> SplitResult.Split Then
  96. Exit While
  97. End If
  98. tl = rest
  99. doc.NewPage()
  100. End While
  101. '' Append the original document for reference:
  102. doc.MergeWithDocument(doc1, New MergeDocumentOptions())
  103.  
  104. '' Done:
  105. doc.Save(stream)
  106. Return doc.Pages.Count
  107. End Using
  108. End Function
  109. End Class
  110.