In This Topic
Programming / OCR / How to OCR a specific zone of a PDF document

How to OCR a specific zone of a PDF document

In This Topic

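If you know the position of the text within your PDF document, or you only want to OCR a certain part of the page and ignore the rest, all you have to do is render the page to an image, set the ROI (region of interest) to the area you desire, and run the OCR engine. The example below renders the first page at 200 DPI, recognizes two zones (a digits-only phone number on a single line and an address block), and saves both results to a text file.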

Copy Code
'We assume GdPicture has been correctly installed and unlocked.
'This example renders the first page of "input.pdf" to an image, runs the OCR on
'two regions of interest (a phone number restricted to digits on a single line,
'then an address block with the character set cleared) and saves the successful
'results to "OCR.txt".
Dim oGdPictureOCR As GdPictureOCR = New GdPictureOCR()
Dim oGdPicturePDF As GdPicturePDF = New GdPicturePDF()
'Loading the PDF document.
If oGdPicturePDF.LoadFromFile("input.pdf", False) = GdPictureStatus.OK Then
    'Selecting the first page.
    oGdPicturePDF.SelectPage(1)
    'Rendering the page to a 200 DPI image.
    Dim rasterPageID As Integer = oGdPicturePDF.RenderPageToGdPictureImage(200, True)
    'Setting up the image.
    If (oGdPicturePDF.GetStat() = GdPictureStatus.OK) AndAlso
       (oGdPictureOCR.SetImage(rasterPageID) = GdPictureStatus.OK) Then
        'Collects the IDs of the OCR results that completed successfully.
        Dim results As List(Of String) = New List(Of String)()
        'Setting up the OCR parameters.
        oGdPictureOCR.ResourceFolder = "C:\GdPicture.NET 14\Redist\OCR"
        oGdPictureOCR.AddLanguage(OCRLanguage.English)
        'Setting up the OCR context and the character list.
        oGdPictureOCR.Context = OCRContext.OCRContextSingleLine
        oGdPictureOCR.CharacterSet = "0123456789"
        'Setting the area to be processed by the OCR.
        oGdPictureOCR.SetROI(100, 100, 200, 50)
        'Running the OCR process to recognize the phone number.
        oGdPictureOCR.RunOCR("PhoneNumber")
        If oGdPictureOCR.GetStat() = GdPictureStatus.OK Then
            results.Add("PhoneNumber")
        Else
            MessageBox.Show("Error occurred when performing the first OCR. Status: " + oGdPictureOCR.GetStat().ToString(), "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Error)
        End If
        'Setting up the OCR context and the character list.
        'A failure of the first zone does not stop the second zone from being processed.
        oGdPictureOCR.Context = OCRContext.OCRContextSingleBlock
        oGdPictureOCR.CharacterSet = ""
        'Setting the area to be processed by the OCR.
        oGdPictureOCR.SetROI(100, 200, 200, 200)
        'Running the OCR process to recognize the address.
        oGdPictureOCR.RunOCR("Address")
        If oGdPictureOCR.GetStat() = GdPictureStatus.OK Then
            results.Add("Address")
        Else
            MessageBox.Show("Error occurred when performing the second OCR. Status: " + oGdPictureOCR.GetStat().ToString(), "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Error)
        End If
        If results.Count > 0 Then
            'Saving the successful results to a text file.
            If oGdPictureOCR.SaveAsText(results, "OCR.txt", OCROutputTextFormat.Utf16, True) = GdPictureStatus.OK Then
                MessageBox.Show("Done!", "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Information)
            End If
        End If
    Else
        MessageBox.Show("Error occurred when setting up the page/image. Status: " + oGdPicturePDF.GetStat().ToString() + " or " + oGdPictureOCR.GetStat().ToString(), "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Error)
    End If
    'Releasing the image on both paths, so it is not leaked when rendering
    'succeeded but SetImage failed.
    'NOTE(review): assumes a failed render returns 0 as the image ID - confirm
    'against the RenderPageToGdPictureImage reference.
    If rasterPageID <> 0 Then
        GdPictureDocumentUtilities.DisposeImage(rasterPageID)
    End If
    'Closing the document.
    oGdPicturePDF.CloseDocument()
Else
    MessageBox.Show("The file can't be loaded. Status: " + oGdPicturePDF.GetStat().ToString(), "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Error)
End If
'Clearing resources.
oGdPictureOCR.Dispose()
oGdPicturePDF.Dispose()
Copy Code
//We assume GdPicture has been correctly installed and unlocked.
//This example renders the first page of "input.pdf" to an image, runs the OCR on
//two regions of interest (a phone number restricted to digits on a single line,
//then an address block with the character set cleared) and saves the successful
//results to "OCR.txt".
GdPictureOCR oGdPictureOCR = new GdPictureOCR();
GdPicturePDF oGdPicturePDF = new GdPicturePDF();
//Loading the PDF document.
if (oGdPicturePDF.LoadFromFile("input.pdf", false) == GdPictureStatus.OK)
{
    //Selecting the first page.
    oGdPicturePDF.SelectPage(1);
    //Rendering the page to a 200 DPI image.
    int rasterPageID = oGdPicturePDF.RenderPageToGdPictureImage(200, true);
    //Setting up the image.
    if ((oGdPicturePDF.GetStat() == GdPictureStatus.OK) &&
        (oGdPictureOCR.SetImage(rasterPageID) == GdPictureStatus.OK))
    {
        //Collects the IDs of the OCR results that completed successfully.
        List<string> results = new List<string>();
        //Setting up the OCR parameters.
        oGdPictureOCR.ResourceFolder = "C:\\GdPicture.NET 14\\Redist\\OCR";
        oGdPictureOCR.AddLanguage(OCRLanguage.English);
        //Setting up the OCR context and the character list.
        oGdPictureOCR.Context = OCRContext.OCRContextSingleLine;
        oGdPictureOCR.CharacterSet = "0123456789";
        //Setting the area to be processed by the OCR.
        oGdPictureOCR.SetROI(100, 100, 200, 50);
        //Running the OCR process to recognize the phone number.
        oGdPictureOCR.RunOCR("PhoneNumber");
        if (oGdPictureOCR.GetStat() == GdPictureStatus.OK)
        {
            results.Add("PhoneNumber");
        }
        else
        {
            MessageBox.Show("Error occurred when performing the first OCR. Status: " + oGdPictureOCR.GetStat().ToString(), "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Error);
        }
        //Setting up the OCR context and the character list.
        //A failure of the first zone does not stop the second zone from being processed.
        oGdPictureOCR.Context = OCRContext.OCRContextSingleBlock;
        oGdPictureOCR.CharacterSet = "";
        //Setting the area to be processed by the OCR.
        oGdPictureOCR.SetROI(300, 100, 200, 200);
        //Running the OCR process to recognize the address.
        oGdPictureOCR.RunOCR("Address");
        if (oGdPictureOCR.GetStat() == GdPictureStatus.OK)
        {
            results.Add("Address");
        }
        else
        {
            MessageBox.Show("Error occurred when performing the second OCR. Status: " + oGdPictureOCR.GetStat().ToString(), "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Error);
        }
        if (results.Count > 0)
        {
            //Saving the successful results to a text file.
            if (oGdPictureOCR.SaveAsText(results, "OCR.txt", OCROutputTextFormat.Utf16, true) == GdPictureStatus.OK)
            {
                MessageBox.Show("Done!", "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Information);
            }
        }
    }
    else
    {
        MessageBox.Show("Error occurred when setting up the page/image. Status: " + oGdPicturePDF.GetStat().ToString() + " or " + oGdPictureOCR.GetStat().ToString(), "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
    //Releasing the image on both paths, so it is not leaked when rendering
    //succeeded but SetImage failed.
    //NOTE(review): assumes a failed render returns 0 as the image ID - confirm
    //against the RenderPageToGdPictureImage reference.
    if (rasterPageID != 0)
    {
        GdPictureDocumentUtilities.DisposeImage(rasterPageID);
    }
    //Closing the document.
    oGdPicturePDF.CloseDocument();
}
else
{
    MessageBox.Show("The file can't be loaded. Status: " + oGdPicturePDF.GetStat().ToString(), "OCR zone Example", MessageBoxButtons.OK, MessageBoxIcon.Error);
}
//Clearing resources.
oGdPictureOCR.Dispose();
oGdPicturePDF.Dispose();