ExtractText.vb
  1. ''
  2. '' This code is part of Document Solutions for PDF demos.
  3. '' Copyright (c) MESCIUS inc. All rights reserved.
  4. ''
  5. Imports System.IO
  6. Imports System.Drawing
  7. Imports GrapeCity.Documents.Text
  8. Imports GrapeCity.Documents.Pdf
  9. Imports GCTEXT = GrapeCity.Documents.Text
  10. Imports GCDRAW = GrapeCity.Documents.Drawing
  11.  
  12. '' This sample demonstrates how to extract text from an existing PDF.
  13. '' It loads an arbitrary PDF into a temporary GcPdfDocument, then
  14. '' retrieves text from each page of that document using the Page.GetText() method,
  15. '' adds all those texts to a TextLayout and renders it into the current document.
  16. '' An alternative to Page.GetText() is the method GcPdfDocument.GetText()
  17. '' which retrieves the text from the whole document at once.
  Public Class ExtractText
      '' Builds a PDF that contains the text extracted from every page of
      '' "Resources/PDFs/Wetlands.pdf", each page's text preceded by a caption,
      '' and saves the result to the supplied stream.
      ''
      '' stream:  the stream that the generated document is saved to.
      '' Returns: the number of pages in the generated document.
      Function CreatePDF(ByVal stream As Stream) As Integer
          Dim target = New GcPdfDocument()
          Dim firstPage = target.NewPage()

          '' Explanatory note on the first page; its bounds drive the margins below.
          Dim noteBounds = Util.AddNote(
              "This sample loads an arbitrary PDF into a temporary GcPdfDocument, " &
              "then retrieves text from each page of the loaded document using the Page.GetText() method, " &
              "adds all those texts to a TextLayout and renders it into the current document. " &
              "An alternative to Page.GetText() is the method GcPdfDocument.GetText() " &
              "which retrieves the text from the whole document at once.",
              firstPage)

          '' Format used for the "Text from page N" captions:
          Dim captionFormat = New TextFormat()
          captionFormat.Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf"))
          captionFormat.FontSize = 14
          captionFormat.ForeColor = Color.Blue

          '' Layout that accumulates all captions and page texts:
          Dim layout = New TextLayout(72)
          With layout
              .DefaultFormat.Font = StandardFonts.Times
              .DefaultFormat.FontSize = 12
              .MaxWidth = target.PageSize.Width
              .MaxHeight = target.PageSize.Height
              '' MarginAll must be assigned before MarginTop, as in the original order:
              .MarginAll = noteBounds.Left
              '' On the first page the text starts 36 units below the note:
              .MarginTop = noteBounds.Bottom + 36
          End With

          '' Split options (widow/orphan control); continuation pages use
          '' the note's left offset as their top margin:
          Dim splitOptions = New TextSplitOptions(layout)
          With splitOptions
              .MinLinesInFirstParagraph = 2
              .MinLinesInLastParagraph = 2
              .RestMarginTop = noteBounds.Left
          End With

          '' Load the source PDF into a temporary document; the file stream stays
          '' open until extraction and rendering are finished:
          Using input As New FileStream(Path.Combine("Resources", "PDFs", "Wetlands.pdf"), FileMode.Open, FileAccess.Read)
              Dim source = New GcPdfDocument()
              source.Load(input)

              '' Collect the text of every page of the loaded document:
              Dim pageTexts = New List(Of String)()
              For Each sourcePage In source.Pages
                  pageTexts.Add(sourcePage.GetText())
              Next

              '' Append a caption (1-based page number) followed by the page's text:
              Dim pageNumber = 0
              For Each pageText In pageTexts
                  pageNumber += 1
                  layout.AppendLine(String.Format("Text from page {0} of the loaded document:", pageNumber), captionFormat)
                  layout.AppendLine(pageText)
              Next
              layout.PerformLayout(True)

              '' Draw the layout page by page, adding pages while text remains:
              Dim moreToRender As Boolean
              Do
                  '' 'overflow' receives the text that did not fit:
                  Dim overflow As TextLayout = Nothing
                  Dim outcome = layout.Split(splitOptions, overflow)
                  target.Pages.Last.Graphics.DrawTextLayout(layout, PointF.Empty)
                  moreToRender = (outcome = SplitResult.Split)
                  If moreToRender Then
                      layout = overflow
                      target.NewPage()
                  End If
              Loop While moreToRender
          End Using

          '' Done:
          target.Save(stream)
          Return target.Pages.Count
      End Function
  End Class
  86.