Frequently Asked Question:
Extract text and images and insert into new PDF
How do I extract all the text and images from one PDF and draw it onto pages in a new PDF?
Here is a code sample (Delphi and C# examples are shown below) that demonstrates how to extract all of the text and images from one PDF and draw them onto as many pages as required in a new PDF.
Delphi Code:
var
  FH: Integer;            // Direct access handle of the source PDF
  PR: Integer;            // Page reference of the page being copied
  SL: TStringList;        // Extracted text, one text block per line
  Data: string;           // Part of the current line that is still unparsed
  // Leading fields of a text block line, in order:
  //   0 = font name, 1 = text color, 2 = text size,
  //   3..10 = X1, Y1, X2, Y2, X3, Y3, X4, Y4
  Fields: array[0..10] of string;
  FieldIdx: Integer;
  CommaPos: Integer;
  Text: string;           // The text of the block with its enclosing quotes removed
  X: Integer;
  IL: Integer;            // Image list handle for the current page
  TextSize: Double;
  TextBlockLeft: Double;
  TextBlockTop: Double;
  ErrCode: Integer;       // Result of Val: 0 on success, else position of the bad character
  PageNum: Integer;
  ImageData: string;
  ImageLeft, ImageTop, ImageWidth, ImageHeight: Double;
// Open the file in direct access mode and store the file handle
FH := QP.DAOpenFile('Xpod1228090001.pdf', '');
try
  // Loop through all the pages; each page is written to its own output file
  for PageNum := 1 to QP.DAGetPageCount(FH) do
  begin
    // Start a new document
    QP.NewDocument;
    // Specify that images should be compressed
    QP.CompressImages(1);
    // Get a page reference to the current page
    PR := QP.DAFindPage(FH, PageNum);
    // Create a string list to hold the text data
    SL := TStringList.Create;
    try
      // Extract the text from the current page (option 4: one comma-separated
      // line per text block, with the text itself as the last, quoted field)
      SL.Text := QP.DAExtractPageText(FH, PR, 4);
      // Add each block of text to the new document
      for X := 0 to SL.Count - 1 do
      begin
        Data := SL[X];
        // Nothing to draw for a blank line
        if Trim(Data) = '' then
          Continue;
        // Peel the eleven leading fields off the front of the line. Whatever
        // remains afterwards is the quoted text, which may itself contain
        // commas and therefore must not be split any further.
        for FieldIdx := Low(Fields) to High(Fields) do
        begin
          CommaPos := Pos(',', Data);
          if CommaPos = 0 then
            raise EConvertError.CreateFmt('Malformed text block: "%s"', [SL[X]]);
          Fields[FieldIdx] := Copy(Data, 1, CommaPos - 1);
          Delete(Data, 1, CommaPos);
        end;
        // Drop the first and last character (the enclosing quotes)
        Text := Copy(Data, 2, Length(Data) - 2);
        // Replace the utf-8 encoded TM symbol with the
        // PDF WinAnsi character code
        if Pos(#226#132#162, Text) > 0 then
          Text := StringReplace(Text, #226#132#162, #153,
            [rfReplaceAll]);
        // Convert the numeric fields with Val rather than StrToFloat: Val
        // always expects '.' as the decimal separator, so the conversion does
        // not depend on the regional settings of the machine.
        Val(Trim(Fields[2]), TextSize, ErrCode);
        if ErrCode = 0 then
          Val(Trim(Fields[3]), TextBlockLeft, ErrCode);
        if ErrCode = 0 then
          Val(Trim(Fields[4]), TextBlockTop, ErrCode);
        if ErrCode <> 0 then
          raise EConvertError.CreateFmt('Invalid number in text block: "%s"', [SL[X]]);
        // Set the text size (must happen before GetTextDescent is read)
        QP.SetTextSize(TextSize);
        // Draw the text, shift up by the font's "descent" value
        QP.DrawText(TextBlockLeft,
          TextBlockTop - QP.GetTextDescent,
          Text);
      end;
    finally
      SL.Free;
    end;
    // Find all the images on the page
    IL := QP.DAGetPageImageList(FH, PR);
    // Loop through all the images (image indexes start at 1)
    for X := 1 to QP.DAGetImageListCount(FH, IL) do
    begin
      // Read the image data
      ImageData := QP.DAGetImageDataToString(FH, IL, X);
      // Add the image data to the new document
      QP.AddImageFromString(ImageData, 0);
      // Determine the location and size of the image on the page.
      // Properties 501 and 502 are used as the left and top of the image;
      // presumably 503 is the right edge and 508 the bottom edge - confirm
      // against the library's property list.
      ImageLeft := QP.DAGetImageDblProperty(FH, IL, X, 501);
      ImageTop := QP.DAGetImageDblProperty(FH, IL, X, 502);
      ImageWidth := QP.DAGetImageDblProperty(FH, IL, X, 503) - ImageLeft;
      ImageHeight := ImageTop - QP.DAGetImageDblProperty(FH, IL, X, 508);
      // Draw the image onto the new document's page
      QP.DrawImage(ImageLeft, ImageTop, ImageWidth, ImageHeight);
    end; // End image loop
    // Compress the page description commands
    QP.CompressContent;
    // Save the file
    QP.SaveToFile('XPod-' + IntToStr(PageNum) + '.pdf');
    // Remove the document
    QP.RemoveDocument(QP.SelectedDocument);
  end; // End page loop
finally
  // Release the direct access file handle even if a page failed
  QP.DACloseFile(FH);
end;
That is all that is required. This code could be enhanced further to replicate any bold or italic text by using the same functions to retrieve more of the text/font properties.
C# Code
// Unlock the library; replace the placeholder with a real licence key.
QP.UnlockKey(".........LicenceKey.....");

// Open the source PDF in direct access mode and keep the file handle.
int FileHandle = QP.DAOpenFile("C:\\Input.pdf", "");

// The extracted numbers are parsed with the invariant culture: the fields are
// comma-separated, so the decimal separator in the data has to be '.', and
// parsing must not depend on the regional settings of the machine.
System.Globalization.NumberStyles numberStyle = System.Globalization.NumberStyles.Float;
System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

try
{
    int pageCount = QP.DAGetPageCount(FileHandle);

    // Each source page is rebuilt in a fresh document and saved to its own file.
    for (int i = 1; i <= pageCount; i++)
    {
        // Create an empty document in memory
        int NewId = QP.NewDocument();
        QP.CompressImages(1);

        // Get page reference
        int PageRef = QP.DAFindPage(FileHandle, i);

        // Get page text in CSV format, one text block per line, laid out as:
        // Font Name, Text Color, Text Size, X1, Y1, X2, Y2, X3, Y3, X4, Y4, Text
        string PageText = QP.DAExtractPageText(FileHandle, PageRef, 4);

        // Split on either line-break character and drop the empty entries that
        // a "\r\n" pair would otherwise produce.
        string[] TextLines = PageText.Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

        foreach (string line in TextLines)
        {
            // Limit the split to 12 parts so that commas inside the text
            // (the last field) do not truncate it.
            string[] DataFields = line.Split(new char[] { ',' }, 12);
            if (DataFields.Length < 12)
            {
                // Not a complete text block; nothing to draw.
                continue;
            }

            double textSize;
            double textLeft;
            double textTop;
            if (!double.TryParse(DataFields[2], numberStyle, invariant, out textSize) ||
                !double.TryParse(DataFields[3], numberStyle, invariant, out textLeft) ||
                !double.TryParse(DataFields[4], numberStyle, invariant, out textTop))
            {
                // Skip lines whose size or position is not a valid number.
                continue;
            }

            // Remove only the enclosing quotes; quotes inside the text are kept.
            // NOTE(review): assumes quotes inside the text are not escaped - confirm.
            string word = DataFields[11];
            if (word.Length >= 2 && word[0] == '"' && word[word.Length - 1] == '"')
            {
                word = word.Substring(1, word.Length - 2);
            }

            // Set text font size (must happen before GetTextDescent is read)
            QP.SetTextSize(textSize);

            // Draw text to new document, shifted by the font's descent value
            QP.DrawText(textLeft, textTop - QP.GetTextDescent(), word);
        }

        // Get image list reference
        int ImageListId = QP.DAGetPageImageList(FileHandle, PageRef);
        int imageCount = QP.DAGetImageListCount(FileHandle, ImageListId);

        // Image indexes start at 1
        for (int j = 1; j <= imageCount; j++)
        {
            // Read the image data as a byte array and add it to the new document
            byte[] ImageData = QP.DAGetImageDataToString(FileHandle, ImageListId, j);
            QP.AddImageFromString(ImageData, 0);

            // Retrieve the image's left, top, width and height.
            // Properties 501 and 502 are used as the left and top of the image;
            // presumably 503 is the right edge and 508 the bottom edge - confirm
            // against the library's property list.
            double imgLeft = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 501);
            double imgTop = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 502);
            double imgWidth = QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 503) - imgLeft;
            double imgHeight = imgTop - QP.DAGetImageDblProperty(FileHandle, ImageListId, j, 508);

            // Draw image to new document
            QP.DrawImage(imgLeft, imgTop, imgWidth, imgHeight);
        }

        QP.CompressContent();

        // Save each page to a new file, numbered by page
        QP.SaveToFile("C:\\XPod" + Convert.ToString(i) + ".pdf");

        // Remove the finished document by its ID. (Passing the result of
        // SelectDocument here would hand RemoveDocument a status value
        // instead of the document ID.)
        QP.RemoveDocument(NewId);
    }
}
finally
{
    // Release the direct access file handle even if a page failed
    QP.DACloseFile(FileHandle);
}