Hot questions for Using PDFBox in parsing

Question:

I've been working around PDF parsing since late last week. Managed to find the Apache PDFBox library for Java and I already got to extract text separated by page, URLs, images and PDF Metadata that I needed for the project I'm developing. Now I'm missing a way to extract embedded flash videos from a PDF.

I'm currently analysing how this parser extracts rich media from PDFs, using, for test purposes the pdf file available here. This file contains a flash video which I intended to fetch.

I already tried using this approach which searches for embedded files inside the PDF but it is currently not working for me as it finds and saves nothing inside the folder I created to store this kind of files.

What my code currently looks like, adapted from the approach mentioned above.

package myproject;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;

/**
 * Example that extracts every embedded file from a PDF document.
 *
 * <p>Two places are searched: the document-level embedded-files name tree and the
 * file-attachment annotations found on each page. Every embedded file found is written
 * into a single output folder.
 */
public final class ExtractEmbeddedFiles
{
    /** PDF that is processed when no path is given on the command line. */
    private static final String DEFAULT_PDF =
            "/Users/henriqueferreira/Documents/PDFBoxDocuments/inOntario.pdf";

    /** Folder that receives the extracted files when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_DIR =
            "/Users/henriqueferreira/Documents/PDFBoxFiles/";

    private ExtractEmbeddedFiles()
    {
    }

    /**
     * This is the main method.
     *
     * @param args The command line arguments: optionally the PDF to read, followed
     *             optionally by the folder to write to. Missing arguments fall back to
     *             the built-in defaults.
     *
     * @throws IOException If there is an error parsing the document or writing a file.
     */
    public static void main( String[] args ) throws IOException
    {
        File pdfFile = new File(args.length > 0 ? args[0] : DEFAULT_PDF);
        String filePath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_DIR;
        if (!filePath.endsWith(File.separator))
        {
            // extractFile() concatenates folder and name, so the separator must be present
            filePath += File.separator;
        }

        try (PDDocument document = PDDocument.load(pdfFile))
        {
            PDDocumentNameDictionary namesDictionary =
                    new PDDocumentNameDictionary( document.getDocumentCatalog() );
            PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
            if (efTree != null)
            {
                extractFromNameTree(efTree, filePath);
            }

            // extract files from annotations
            for (PDPage page : document.getPages())
            {
                for (PDAnnotation annotation : page.getAnnotations())
                {
                    if (!(annotation instanceof PDAnnotationFileAttachment))
                    {
                        continue;
                    }
                    // Only complex file specifications can carry an embedded stream; anything
                    // else (including a missing specification) is skipped instead of cast blindly.
                    Object spec = ((PDAnnotationFileAttachment) annotation).getFile();
                    if (spec instanceof PDComplexFileSpecification)
                    {
                        PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) spec;
                        extractFile(filePath, fileSpec.getFilename(), getEmbeddedFile(fileSpec));
                    }
                }
            }
        }
    }

    /**
     * Walks a name tree node and all of its descendants, extracting the files of every
     * node that carries names.
     *
     * @param node     the node to start from
     * @param filePath the output folder, ending with a separator
     * @throws IOException if a node cannot be read or a file cannot be written
     */
    private static void extractFromNameTree(PDNameTreeNode<PDComplexFileSpecification> node,
            String filePath) throws IOException
    {
        Map<String, PDComplexFileSpecification> names = node.getNames();
        if (names != null)
        {
            extractFiles(names, filePath);
        }
        List<PDNameTreeNode<PDComplexFileSpecification>> kids = node.getKids();
        if (kids != null)
        {
            for (PDNameTreeNode<PDComplexFileSpecification> kid : kids)
            {
                extractFromNameTree(kid, filePath);
            }
        }
    }

    /**
     * Extracts every file specification of a names map.
     *
     * @param names    map from name-tree key to file specification
     * @param filePath the output folder, ending with a separator
     * @throws IOException if a file cannot be written
     */
    private static void extractFiles(Map<String, PDComplexFileSpecification> names, String filePath)
            throws IOException
    {
        for (Entry<String, PDComplexFileSpecification> entry : names.entrySet())
        {
            extractFile(filePath, entry.getKey(), getEmbeddedFile(entry.getValue()));
        }
    }

    /**
     * Writes one embedded file into the output folder.
     *
     * @param filePath     the output folder, ending with a separator
     * @param filename     the name recorded in the PDF; may be {@code null}
     * @param embeddedFile the embedded stream; may be {@code null}
     * @throws IOException if the file cannot be written
     */
    private static void extractFile(String filePath, String filename, PDEmbeddedFile embeddedFile)
            throws IOException
    {
        if (filename == null || embeddedFile == null)
        {
            System.err.println("Skipping " + filename + ": no name or no embedded stream");
            return;
        }
        // The name comes from the PDF; keep only its last path element so it cannot
        // point outside the output folder.
        String embeddedFilename = filePath + new File(filename).getName();
        System.out.println("Writing " + embeddedFilename);
        try (FileOutputStream fos = new FileOutputStream(embeddedFilename))
        {
            fos.write(embeddedFile.toByteArray());
        }
    }

    /**
     * Returns the first available alternative of the embedded file.
     *
     * @param fileSpec the file specification; may be {@code null}
     * @return the embedded file, or {@code null} if the specification has none
     */
    private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpec )
    {
        // search for the first available alternative of the embedded file
        PDEmbeddedFile embeddedFile = null;
        if (fileSpec != null)
        {
            embeddedFile = fileSpec.getEmbeddedFileUnicode();
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileDos();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileMac();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFileUnix();
            }
            if (embeddedFile == null)
            {
                embeddedFile = fileSpec.getEmbeddedFile();
            }
        }
        return embeddedFile;
    }
}

So, my question is, what should be the most appropriate method to get this type of flash videos from a PDF file?


Answer:

Here's some quick code based on what I saw with PDFDebugger:

// Lists every rich-media asset (for example an embedded flash video) found in the
// "RichMedia" annotations of the document, together with its page and size.
try (PDDocument doc = PDDocument.load(new File("Mississauga_Advantages.pdf")))
{
    for (int p = 0; p < doc.getNumberOfPages(); ++p)
    {
        PDPage page = doc.getPage(p);
        List<PDAnnotation> annotations = page.getAnnotations();
        for (PDAnnotation ann : annotations)
        {
            if (!"RichMedia".equals(ann.getSubtype()))
            {
                continue;
            }
            Object names = ann.getCOSObject().getObjectFromPath("RichMediaContent/Assets/Names/");
            if (!(names instanceof COSArray))
            {
                // no asset list at the expected place in this annotation
                continue;
            }
            // The array holds alternating entries: a name followed by its file specification.
            COSArray array = (COSArray) names;
            for (int i = 0; i + 1 < array.size(); i += 2)
            {
                String name = array.getString(i);
                Object filespec = array.getObject(i + 1);
                if (!(filespec instanceof COSDictionary))
                {
                    continue;
                }
                PDComplexFileSpecification cfs = new PDComplexFileSpecification((COSDictionary) filespec);
                PDEmbeddedFile embeddedFile = cfs.getEmbeddedFile();
                if (embeddedFile == null)
                {
                    continue;
                }
                // close the stream again; it is only opened to report the size
                try (java.io.InputStream in = embeddedFile.createInputStream())
                {
                    System.out.println("page: " + (p+1) + ", name: " + name + ", size: " + in.available());
                }
            }
        }
    }
}

Your rich media is in an annotation. So I went through the list and looked for the pattern that I saw. I don't know if this is standard or not, I didn't find it in the PDF 32000 specification. (Update: I found it here after writing the code)

Question:

Yes, I know this is a repeated question, but I still need to understand a lot about horizontal parsing. I am hoping for a complete, clear answer here.

I have some example content stream like below:

Example1:

BT
/F33 20.665 Tf
72 633.8289 Td
[(Chapter)-375(12)]TJ
/F33 24.78709 Tf
0 51.30099 Td
[(P)31(arametric)-375(and)-375(P)32(olar)-375(Curv)31(es)]TJ

Example2:

BT
/C0_1 14 Tf
39.812999 681.73999 Td
[(\000"\000M\000U\000I\000P\000V\000H\000I\000\001)-82(\000$\000B\000S\000P\000V\000T\000F\000M\000\001)-82(\000X\000B\000T\000\001)-82.07099........]TJ

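The formula for the horizontal displacement (tx) is

$t_x = \left(\left(w_0 - \frac{T_j}{1000}\right) \times T_{fs} + T_c + T_w\right) \times T_h$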

Now I want to substitute values behalf of example1 :

W0 = ?(here mkl mentioned w0 means width of the respective character from the width array. How can I get the lengths. what are the different values for above 3 examples. How can I get from existing pdf. How can I get character width from CMAP's.)

Tj = The numbers in TJ array.

Tfs = use the font size from the graphics state which is the font size parameter from the relevant Tf operation, e.g. 10.

Tc = use the value from the graphics state which is the parameter from the relevant Tc or " operation.

Tw = use 0 or (in case of a single-byte character code 32) the value from the graphics state which is the parameter from the relevant Tw or " operation.

Th = use the value from the graphics state which is the parameter from the relevant Tz operation divided by 100.

Please write a step-by-step solution for each example and, if possible, explain it for all types of TJ arrays that PDFs may use in a content stream. I read the concept in PDF32000_2008 (9.4.4 Text Space Details) but I am still confused. You can find the actual PDFs at the links below.

Example 1 file

Example2 file


Answer:

It sounds like you most of all wonder where to retrieve the widths, the w0 values from.

This actually is easy, the widths arrays are in the PDF font objects! In case of simple fonts the width values are in the Widths array. The only exception are the standard 14 fonts. For them a PDF processor is expected to know the widths of the glyphs. In case of CID fonts the width values are in the W array defaulting to the DW value defaulting to 1000.

In case of Type 1, TrueType, and CID fonts the widths are measured in units in which 1000 units correspond to 1 unit in text space.

In case of Type 3 fonts these widths shall be interpreted in glyph space as specified by FontMatrix; but as a note there indicates a common practice is to define glyphs in terms of a 1000-unit glyph coordinate system, in which case the font matrix is [0.001 0 0 0.001 0 0] which gives rise to the same 1000:1 ratio as above.

Your first example
/F33 20.665 Tf
72 633.8289 Td
[(Chapter)-375(12)]TJ

Here the font F33 is selected with font size 20.665 in the first instruction. That font is defined in object 21:

24 0 obj
[656.2 625 625 937.5 937.5 312.5 343.7 562.5 562.5 562.5 562.5 562.5 849.5 500 574.1 812.5 875 562.5 1018.5 1143.5 875 312.5 342.6 581 937.5 562.5 937.5 875 312.5 437.5 437.5 562.5 875 312.5 375 312.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 312.5 312.5 342.6 875 531.2 531.2 875 849.5 799.8 812.5 862.3 738.4 707.2 884.3 879.6 419 581 880.8 675.9 1067.1 879.6 844.9 768.5 844.9 839.1 625 782.4 864.6 849.5 1162 849.5 849.5 687.5 312.5 581 312.5 562.5 312.5 312.5 546.9 625 500 625 513.3 343.7 562.5 625 312.5 343.7 593.7 312.5 937.5 625 562.5 625 593.7 459.5 443.8 437.5 625 593.7 812.5 593.7 593.7 500]
endobj
21 0 obj
<<
/BaseFont /HLFPHX+CMBX12
/FirstChar 11
/FontDescriptor 22 0 R
/LastChar 122
/Subtype /Type1
/Type /Font
/Widths 24 0 R
>> 

Thus, the glyph with code 11 has a width of .6562, the one with code 12 is .625 units wide, etc.

So at the beginning the text matrix and text line matrix point to (0, 0). After 72 633.8289 Td they point to (72, 633.8289). This is where 'C' is drawn.

Drawing 'C' advances the position the text matrix points to by a tx value of

((w0 - Tj/1000) × Tfs + Tc + Tw) * Th

The 'C' we see in the instruction parameter actually is the byte 0x43 = 67. Thus in the Widths array we find the w0 value at 1000:1 at index 56 (0-based), 812.5. There are no numeric parameters immediately following the 'C', so Tj is 0. Tfs is 20.665. Tc and Tw both are 0. Th is 1.

Thus, tx is ((.8125 - 0) × 20.665 + 0 + 0) × 1 = 16.7903125 and drawing 'C' advances the position the text matrix points to to (88.7903125, 633.8289). This is where 'h' is drawn.

Similarly drawing 'h' advances the position by tx = ((.625 - 0) × 20.665 + 0 + 0) × 1 = 12.915625 to (101.7059375, 633.8289). This is where 'a' is drawn.

Drawing 'a' advances the position by tx = ((.5469 - 0) × 20.665 + 0 + 0) × 1 = 11.3016885 to (113.007626, 633.8289). This is where 'p' is drawn.

Drawing 'p' advances the position by tx = ((.625 - 0) × 20.665 + 0 + 0) × 1 = 12.915625 to (125.923251, 633.8289). This is where 't' is drawn.

Drawing 't' advances the position by tx = ((.4375 - 0) × 20.665 + 0 + 0) × 1 = 9.0409375 to (134.9641885, 633.8289). This is where 'e' is drawn.

Drawing 'e' advances the position by tx = ((.5133 - 0) × 20.665 + 0 + 0) × 1 = 10.6073445 to (145.571533, 633.8289). This is where 'r' is drawn.

Drawing 'r' considering the numeric parameter -375 advances the position by tx = ((.4595 - (-375/1000)) × 20.665 + 0 + 0) × 1 = 17.2449425 to (162.8164755, 633.8289). This is where '1' is drawn.

Drawing '1' advances the position by tx = ((.5625 - 0) × 20.665 + 0 + 0) × 1 = 11.6240625 to (174.440538, 633.8289). This is where '2' is drawn.

Drawing '2' advances the position by tx = ((.5625 - 0) × 20.665 + 0 + 0) × 1 = 11.6240625 to (186.0646005, 633.8289).

Following this the instruction /F33 24.78709 Tf changes the text font size to 24.78709 and the instruction 0 51.30099 Td advances the position of the text line matrix and text matrix to (72, 582.52791). This is where 'P' is drawn.

Drawing 'P' considering the numeric parameter 31 advances the position by tx = ((.7685 - (31/1000)) × 24.78709 + 0 + 0) × 1 = 18.280478875 to (90.280478875, 582.52791). This is where 'a' is drawn.

...

Your second example
/C0_1 14 Tf
39.812999 681.73999 Td
[(\000"\000M\000U\000I\000P\000V\000H\000I\000\001)-82(\000$\000B\000S\000P\000V\000T\000F\000M\000\001)-82(\000X\000B\000T\000\001)-82.07099........]TJ

Here the font C0_1 is selected with font size 14. This font is composite, its descendant font is defined in object 24:

24 0 obj
<<
/BaseFont /NFAHTB+MinionPro-Regular
/CIDSystemInfo 30 0 R
/DW 1000
/FontDescriptor 31 0 R
/Subtype /CIDFontType0
/Type /Font
/W [0 [500 227 276 318]
 4 5 480 6 [756 711 223]
 9 10 346
11 [404 580 228 356 228 331]
 17 26 480 27 28 228 29 [552 580 552 379 753 691 588 665 735 568
529 715 766 341 329 673 538 891 743 747
563 745 621 474 617 736 703 971 654 634
603 345 333 345 566 500 224 439 508 423
528 425 296 468 534 268 256 496 253 819
547 510 524 511 371 367 305 531 463 685
472 459 420 347 263 347 580 276]
97 98 480 99 [159]
 100 101 480 102 [477 480 169 398 444]
107 108 279 109 [535 533 520 490 489 226 497 390 239 429
401 445 970 1062 379]
 124 136 400 137 [922 869 305 550 749 973 334 671 268 273
513 770 545 341 580 512 459 737 762 580
549 762 580 263 343 514 762 341 321 580
505 580 341 702]
171 176 691 177 [661]
 178 181 568 182 185
341 186 [743]
 187 191 747 192 [474]
 193 196
736 197 198 634 199 [603]
 200 205 439 206
[421]
 207 210 425 211 214 268 215 [547]
 216
220 510 221 [367]
 222 225 531 226 227 459
228 [420 503 500 480 418]
 233 238 762 239 [691 926 666 627 737 736 766 613 518 637
606 499 1029 763 493 267 526 541 533 525
547 303 385 669 1071 914 876 722 803 561
1071 1081 798 787 1045 801 852 814 535 520
778 533 582 522 856 664 804 814 533 777]
 289 290 533
291 [578]
 292 293 800 294 298 480 299 [828 439 790 565 511 531 584 482 456 565
621 306 297 558 460 709 580 584 484 585
528 408 510 582 567 761 551 511 493 611
621 306 582 510 579 611 481 431 815 723
776 268 606 603 622 242 235 345 346 530
340 446 406 486 403 499 437 466 486 473
468 529 486 481 489 528 483 481 519 710
1009 711 493 338 465 452 497 454 495 464
475 488 493 480 479 574 480 482 480 568
483 486 482]
392 411 486 412 [305 349 355]
 415 416 292 417 [306 372 194 192 543 371 334 262 265 228]
427 436 341 437 [178 177]
 439 440 341 441 [259]
442 443 245 444 453 341 454 [178 177]
 456 457
341 458 [259]
 459 460 245 461 470 341 471
[178 177]
 473 474 341 475 [259]
 476 477 245 478
487 341 488 [178 177]
 490 491 341 492 [259]
 493
494 245 495 497 606 498 [454 469 407 563]
 502 507 691
508 [1058 813]
 510 512 691 513 520 766 521 [566 766]
523 526 757 527 [640 757 598 681]
 531 532 652 533 534
877 535 536 631 537 540 757 541 542 510
543 [256]
 544 545 846 546 [753 922 520 276 444 445]
 552 553 279
554 [356 379]
 556 557 347 558 559 345 560 561
346 562 [226]
 563 564 579 565 [586 587 760 556 375 490 718 561 536 641
757 531 568]
 578 580
691 581 [722]
 582 585 665 586 [735]
 587 591
568 592 596 715 597 [766]
 598 602 341 603
[329 673]
 605 608 538 609 [891]
 610 613 743 614
616 747 617 [749]
 618 620 621 621 [474 477]
 623
624 474 625 626 617 627 629 736 630 [733]
631 632 736 633 636 971 637 639 634 640
641 603 642 [869]
 643 644 1071 645 647 439
648 [512]
 649 652 423 653 [528]
 654 657 425
658 [424]
 659 663 468 664 [534]
 665 668 268
669 [258 496]
 671 673 253 674 [271 819]
 676 679 547
680 682 510 683 [513]
 684 686 371 687 [367 366]
689 690 367 691 692 305 693 698 531 699
702 685 703 705 459 706 707 420 708 [671 367]
710 711 492 712 724 400 725 728 565 729
[723]
 730 731 565 732 [568]
 733 734 565 735
[643]
 736 737 531 738 [528]
 739 740 531 741
[584]
 742 749 482 750 [487]
 751 755 565 756
[621]
 757 760 306 761 [474]
 762 763 306 764
[308 306 297 558]
 768 771 460 772 [478 709]
 774 778 580 779
785 584 786 [582 584]
 788 790 528 791 792 408
793 [412]
 794 795 408 796 797 510 798 804
582 805 [584]
 806 807 582 808 811 761 812
816 511 817 819 493 820 [401 402 401 381 401 375 404 400 401 400
401 400 367 401 691 588 507 641 568 603
766 739 341 673 686 891 743 607 747 738
563 598 617 655 754 654 725 757 691 568
766]
 861 862 341
863 [747]
 864 865 655 866 [757]
 867 873 691
874 882 910 883 887 691 888 895 568 896
901 766 902 910 972 911 914 766 915 926
341 927 932 757 933 941 1007 942 945 757
946 953 747 954 [563 341]
 956 963 655 964 [341 329 889 959 776 650 653 741 691 580
588 512 649 568 954 518]
980 981 752 982 [650 645 891 766 747 735 563 665 617 523
510 495 497 403 381 509 490 245]
 1000 1001 493 1002 [512 476 404 510 501 515 446 481 587 467
605 645 403 497 496 582 665 404 508 669
544 453 523 403 509]
1027 1028 245 1029 [510]
 1030 1031 481 1032 [645 245 481]
1035 1042 523 1043 1048 403 1049 1056 509 1057
1064 245 1065 1070 510 1071 1078 481 1079 1086
645 1087 1088 523 1089 1090 403 1091 1092 509
1093 1094 245 1095 1096 510 1097 1098 481 1099
1100 645 1101 1108 523 1109 1116 509 1117 1124
645 1125 1130 523 1131 1135 509 1136 1141 245
1142 1145 481 1146 1147 501 1148 [481]
 1149 1153
645 1154 [523 481]
 1156 1159 230 1160 1171 400 1172
[353]
 1173 1177 400 1178 1179 405 1180 [400 653 767 654 741 666 958 960 720 840
581 644 956 636 439 501 486 389 490 425
726 408]
 1202
1203 555 1204 [500 494 640 553 510 552 524 423 441 459
672 472 556 507 771 775 566 681 468 440
707 500 425 500 389 449 367]
 1231 1232 268 1233 [256 673 719 533 500 468 545 689 547 736
511 680 467 477 366 428 356 411 872 974
1124 1133 957 457 603 623 830 1006 806 1408
1744 1095 643 566 821 836 906 1602 1675 1584
427 892]
 1275
1276 745 1277 [465 619 776 427 341 566 892]
 1284 1287 400 1288 [747 736 525 547]
 1292
1293 480 1294 1305 691 1306 1313 568 1314 1315
341 1316 1327 747 1328 1334 736 1335 1337 634
1338 1349 439 1350 1357 425 1358 1359 268 1360
1366 510 1367 1371 525 1372 1373 531 1374 1378
547 1379 1381 459 1382 1393 637 1394 1401 606
1402 1413 565 1414 1421 482 1422 1423 306 1424
1436 584 1437 1444 582 1445 1447 511 1448 1457
400 1458 [392]
 1459 1480 400 1481 [565 511 531 584 482 456 565 621 306 297
558 460 709 580 584 484 585 528 408 510
582 567 761 551 511 493 611 621 582 510
579 611 481 431 723 776 603]
 1518 1521
565 1522 [723]
 1523 1524 565 1525 [568]
 1526 1528
565 1529 1530 531 1531 [528]
 1532 1533 531 1534
[584]
 1535 1542 482 1543 [487]
 1544 1548 565 1549
[621]
 1550 1556 306 1557 [308 306 297 558]
 1561 1564 460 1565
[478 709]
 1567 1571 580 1572 1578 584 1579 [582 584]
 1581
1583 528 1584 1585 408 1586 [412]
 1587 1588 408
1589 1590 510 1591 1597 582 1598 [584]
 1599 1600
582 1601 1604 761 1605 1609 511 1610 1612 493
1613 1624 565 1625 1632 482 1633 1634 306 1635
1647 584 1648 1655 582 1656 1658 511 1659 [477 366 617 305 356 227 400 159 226 306
159]
1670 1671 105 1672 [495 565 762 916 297 223 480 461 480 486
480 472 468 486]
]
>>
endobj 

Thus, the glyph with code 0 has a width of 500, code 1 has 227, code 2 has 276, code 3 has 318, code 4 and 5 have 480, code 6 has 756, etc.

Furthermore, it is important that the font encoding is Identity-H which is a pure two-byte encoding.

Thus, at the beginning the text matrix and text line matrix point to (0, 0). After 39.812999 681.73999 Td they point to (39.812999, 681.73999). This is where '\000"' = 0x0022 is drawn.

Drawing 0x0022 advances the position by tx = ((.691 - 0) × 14 + 0 + 0) × 1 = 9.674 to (49.486999, 681.73999). This is where '\000M' = 0x004d is drawn.

...


As you can see, the cases in your example files are simple:

  • no use of character or word spacing, no horizontal scaling;
  • very simple text and text line matrices, plain translation;
  • no changes to the current transformation matrix;
  • no standard 14 fonts;
  • ...

Nonetheless, the concept should have become clear.

And I hope I have not miscalculated too often... ;)

Question:

I have gone through the content streams of different PDFs which contain graphics elements. Some PDFs use a normal CTM coordinate system for graphics, like the one below. Here I am able to compare the CTM positions with my page coordinates.

But I found something strange in this PDF: the x and y translations are in the thousands, while my page dimensions are 576 by 720. How can I compare them with page coordinates? You can see it below. How is the CTM calculated in this case?

And I saw in rules like "A conforming reader or writer of a PDF content stream may change an arrangement of graphics state operators to any other arrangement that achieves the same values of the relevant graphics state parameters for each graphics object."

Can anyone please explain about other such cases where graphics parsing act like this and what considerations need to be taken to handle it in a generic manner ?

Please explain all ways of parsing available graphics coordinates.


Answer:

Please explain all ways of parsing available graphics coordinates.

There essentially is but one way to do that, the way implied by the PDF specification: When reading a content stream, update the current transformation matrix (CTM) according to the effects of the instructions you find.

Let's look at your second content stream.

Initial value

At the beginning the CTM maps the default user space to the device space. As we are interested in coordinates in the default user space itself, for us these spaces coincide and we start with the identity matrix. Furthermore, there are no saved graphics states and therefore no CTM values in saved states yet:

1 0 0        |
0 1 0        |
0 0 1        |
(0) q

The first instruction q saves the current graphics state; thus, we now have a copy of the CTM on the graphics stack:

1 0 0        |    1 0 0
0 1 0        |    0 1 0
0 0 1        |    0 0 1
(1) .1 0 0 .1 0 0 cm

The next instruction .1 0 0 .1 0 0 cm multiplies the CTM from the left:

.1 0  0     1 0 0     .1 0  0
0  .1 0  *  0 1 0  =  0  .1 0
0  0  1     0 0 1     0  0  1

Thus, we have

.1 0  0        |    1 0 0
0  .1 0        |    0 1 0
0  0  1        |    0 0 1
(2..6) ... re W n ... rg ... gs

These instructions don't change the CTM or the state stack.

(7) q

The next instruction q saves the current graphics state; thus

.1 0  0        |    1 0 0    .1 0  0
0  .1 0        |    0 1 0    0  .1 0
0  0  1        |    0 0 1    0  0  1

(I draw the top of the stack on the right side.)

(8) 1 0 0 1 3398 2606 cm

(For the sake of brevity I truncate the values a bit.)

The next instruction 1 0 0 1 3398 2606 cm multiplies the CTM from the left:

   1    0 0     .1 0  0        .1   0   0
   0    1 0  *  0  .1 0  =    0      .1 0
3398 2606 1     0  0  1     339.8 260.6 1

Thus, we now have

   .1   0   0        |    1 0 0    .1 0  0
  0      .1 0        |    0 1 0    0  .1 0
339.8 260.6 1        |    0 0 1    0  0  1

This is the first instruction you were unsure about because of the value in the thousands. After evaluating, though, you see the origin being pushed to fairly normal values 339.8 260.6.

(9..13) ... m ... l ... l h f*

These instructions don't change the CTM or the state stack.

(14) Q

The next instruction Q restores the most recently saved graphics state. Thus, we have

.1 0  0        |    1 0 0
0  .1 0        |    0 1 0
0  0  1        |    0 0 1
(15..17) ... RG ... w ... M

These instructions don't change the CTM or the state stack.

(18) q

The next instruction q saves the current graphics state; thus

.1 0  0        |    1 0 0    .1 0  0
0  .1 0        |    0 1 0    0  .1 0
0  0  1        |    0 0 1    0  0  1
(19) 1 0 0 1 3607 2339 cm

(For the sake of brevity I truncate the values a bit.)

The next instruction 1 0 0 1 3607 2339 cm multiplies the CTM from the left:

   1    0 0     .1 0  0        .1   0   0
   0    1 0  *  0  .1 0  =    0      .1 0
3607 2339 1     0  0  1     360.7 233.9 1

Thus, we now have

   .1   0   0        |    1 0 0    .1 0  0
  0      .1 0        |    0 1 0    0  .1 0
360.7 233.9 1        |    0 0 1    0  0  1

This is the second instruction you were unsure about because of the value in the thousands. After evaluating, though, you again see the origin being pushed to fairly normal values 360.7 233.9.

(20..) etc.

Question:

I have several bank statements from our users. I am trying to figure out a way to parse the rows of transactions. I have used PDFBox previously with TextArea and TextStripper, but I am not sure how to proceed with bank statements, since they will have an undetermined number of rows and the rows may or may not be of fixed size.


Answer:

I wrote just such a parser to parse our Chase PDF credit card statements, to speed up tax preparation, with the help of an open source project called Apache Tika.

You just need to include Tika and its PDF parser in your pom.xml dependencies:

        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
            <version>1.17</version>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>1.17</version>
        </dependency>

the PDF extractor is fairly straightforward also:

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;


/**
 * Command-line driver that extracts the body text of a fixed list of statement PDFs with
 * Tika's {@code PDFParser}, passes each text to {@code ChasePdfParser.parse} and prints
 * all resulting records as {@code date|desc|amt} lines on standard error.
 */
public class PdfExtractor {
    private static final Logger logger = LoggerFactory.getLogger(PdfExtractor.class);

    /**
     * Processes the statement files one after another.
     *
     * @param args not used
     * @throws Exception if a file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): StopWatch and PrettyPrinter are not imported in this snippet; if
        // StopWatch needs an explicit start() before getTime(), add it here - confirm.
        StopWatch sw = new StopWatch();
        List<String> files = new ArrayList<>();
        files.add("C:/Users/m/Downloads/20170115.pdf");
        files.add("C:/Users/m/Downloads/20170215.pdf");
        files.add("C:/Users/m/Downloads/20170315.pdf");
        files.add("C:/Users/m/Downloads/20170415.pdf");
        files.add("C:/Users/m/Downloads/20170515.pdf");
        files.add("C:/Users/m/Downloads/20170615.pdf");
        files.add("C:/Users/m/Downloads/20170715.pdf");
        files.add("C:/Users/m/Downloads/20170815.pdf");
        files.add("C:/Users/m/Downloads/20170915.pdf");
        files.add("C:/Users/m/Downloads/20171015.pdf");
        files.add("C:/Users/m/Downloads/20171115.pdf");
        files.add("C:/Users/m/Downloads/20171215.pdf");
        files.add("C:/Users/m/Downloads/20180115.pdf");
        List<ChasePdfParser.ChaseRecord> full = new ArrayList<>();
        for (String fileName : files) {
            logger.info("Now processing {}", fileName);
            // try-with-resources closes the file even when parsing throws
            try (InputStream is = new FileInputStream(fileName)) {
                // -1 disables the write limit, so long statements are not cut off
                ContentHandler contenthandler = new BodyContentHandler(-1);
                Metadata metadata = new Metadata();
                PDFParser pdfparser = new PDFParser();
                pdfparser.parse(is, contenthandler, metadata, new ParseContext());
                String data = contenthandler.toString();
                full.addAll(ChasePdfParser.parse(data));
            }
        }
        logger.info("Total processing time: {}", PrettyPrinter.toMsSoundsGood(sw.getTime()));
        full.forEach(cr -> System.err.println(cr.date + "|" + cr.desc + "|" + cr.amt));
    }
}

The line parser is also fairly straightforward: since each line has all the necessary info, it's easy to parse:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Turns the plain text of a statement into transaction records.
 *
 * <p>A line is treated as a transaction when it consists of at least four
 * whitespace-separated tokens, starts with an {@code MM/dd} date, ends with a number and
 * does not contain one of the excluded phrases. Everything between the date and the
 * number becomes the description. All dates are placed in the configured tax year.
 */
public class ChasePdfParser {
    private static final int FOR_TAX_YEAR = 2017;
    private static final String YEAR_EXTENSION = "/" + FOR_TAX_YEAR;
    private static final DateTimeFormatter check = DateTimeFormatter.ofPattern("MM/dd/uuuu");
    private static final List<String> exclusions =
            Arrays.asList("Payment Thank You", "AUTOMATIC PAYMENT");

    /**
     * Parses all transaction lines found in the given text.
     *
     * @param data the extracted statement text, lines separated by {@code \n}; may be
     *             {@code null}
     * @return the transactions in the order they appear; empty if there are none
     */
    public static List<ChaseRecord> parse(String data) {
        List<ChaseRecord> records = new ArrayList<>();
        if (data == null) {
            return records;
        }
        for (String line : data.split("\n")) {
            if (skip(line)) {
                continue;
            }
            String[] tokens = line.split("\\s");
            if (tokens.length < 4) {
                continue;
            }
            LocalDate date = extractDate(tokens[0]);
            if (date == null) {
                continue;
            }
            Double amount = parseAmount(tokens[tokens.length - 1]);
            if (amount == null) {
                // a line that does not end in a number is not a transaction row
                continue;
            }
            ChaseRecord cr = new ChaseRecord();
            cr.date = date;
            cr.amt = amount;
            // consecutive blanks in the line produce empty tokens; collapse and trim them away
            cr.desc = String.join(" ", Arrays.copyOfRange(tokens, 1, tokens.length - 1))
                    .replaceAll("\\s\\s+", " ")
                    .trim();
            records.add(cr);
        }
        return records;
    }

    /**
     * Reads an amount such as {@code 1,234.56}.
     *
     * @return the amount, or {@code null} if the token is not a number
     */
    private static Double parseAmount(String token) {
        try {
            return Double.parseDouble(token.replace(",", ""));
        } catch (NumberFormatException ignored) {
            // not a number: reported to the caller as null
            return null;
        }
    }

    /**
     * Tells whether a line must be ignored.
     *
     * @return {@code true} for {@code null}, for an empty line and for a line that
     *         contains one of the excluded phrases
     */
    private static boolean skip(String s) {
        if (s == null || s.isEmpty()) {
            return true;
        }
        for (String e : exclusions) {
            if (s.contains(e)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts an {@code MM/dd} token into a date in the configured tax year.
     *
     * @param s the token; may be {@code null}
     * @return the date, or {@code null} if the token is not a five-character
     *         {@code MM/dd} date
     */
    protected static LocalDate extractDate(String s) {
        if (s == null || s.length() != 5) {
            return null;
        }
        try {
            return LocalDate.parse(s + YEAR_EXTENSION, check);
        } catch (java.time.format.DateTimeParseException notADate) {
            return null;
        }
    }

    /**
     * Tells whether the token is a five-character {@code MM/dd} date.
     *
     * @param s the token; may be {@code null}
     * @return {@code true} if {@link #extractDate(String)} can convert the token
     */
    public static boolean isMMDD(String s) {
        return extractDate(s) != null;
    }

    /** One transaction line of a statement. */
    public static class ChaseRecord {
        public LocalDate date;
        public String desc;
        public Double amt;

        @Override
        public String toString() {
            return "ChaseRecord{" +
                    "date=" + date +
                    ", desc='" + desc + '\'' +
                    ", amt=" + amt +
                    '}';
        }
    }
}

Question:

I have PDFs with extremely large tokens plastered across the entire front page of many documents, see image. I'm looking for an automated method to remove these.

Apache PDFBox has a pretty extensive API, is there any way to match these tokens by Regex and simply remove them and re-save the pdf?

Image from PDF Example posted below. The tokens I'd like to remove are: [KS/2019:589] Lokalvård Grundskolor & Idrottshallar that are plastered on top of the regular text. Google Drive link to full PDF-file.


Answer:

You can use the PdfContentStreamEditor class from this answer (don't forget to apply the fix mentioned at the bottom of the answer) like this:

// Rewrites the content of the first page, dropping every text-showing instruction
// whose effective font size is larger than 50.
try (   PDDocument document = ...   ) {
    PDPage page = document.getPage(0);
    PdfContentStreamEditor editor = new PdfContentStreamEditor(document, page) {
        // Forwards each instruction to the superclass unless it shows text that is too large.
        @Override
        protected void write(ContentStreamWriter contentStreamWriter, Operator operator, List<COSBase> operands) throws IOException {
            String operatorString = operator.getName();

            if (TEXT_SHOWING_OPERATORS.contains(operatorString))
            {
                // The nominal font size alone is not enough: it is scaled by the text
                // matrix and the current transformation matrix. The effective size is
                // measured as the distance between the transformed points (0, fs) and (0, 0).
                float fs = getGraphicsState().getTextState().getFontSize();
                Matrix matrix = getTextMatrix().multiply(getGraphicsState().getCurrentTransformationMatrix());
                Point2D.Float transformedFsVector = matrix.transformPoint(0, fs);
                Point2D.Float transformedOrigin = matrix.transformPoint(0, 0);
                double transformedFs = transformedFsVector.distance(transformedOrigin);
                // returning without calling super.write() leaves the instruction out
                if (transformedFs > 50)
                    return;
            }

            super.write(contentStreamWriter, operator, operands);
        }

        // names of the operators that are checked for their effective font size
        final List<String> TEXT_SHOWING_OPERATORS = Arrays.asList("Tj", "'", "\"", "TJ");
    };
    editor.processPage(page);
    document.save(...);
}

(EditPageContent test testRemoveBigTextKommersAnnonsElite)

You can find some explanations in the referenced answer.

Question:

I'm parsing a PDF using PDFBox and I'm trying to get the text color. I can get other properties like font, size, and position no problem using TextPosition attributes. Here's how I'm doing it:

@Override
protected void writeString (String string, List<TextPosition> textPositions) {

    for (TextPosition textPosition : textPositions) {

        System.out.println(textPosition.getFont());
        System.out.println(textPosition.getFontSizeInPt());
        System.out.println(textPosition.getXDirAdj() + ", " + textPosition.getYDirAdj());

    }

However, I'm unable to retrieve the color of the text. I've searched Google for a solution but nothing has worked so far. Every tutorial I see seems to be using an old version of PDFBox. I don't have several of the methods that these people are using. For example, in one SO question they recommended using this code:

// Attempt recommended in another SO question, reproduced unchanged.
// NOTE(review): getJavaColor() is reported as undefined by the IDE with the
// PDFBox version used in this question, so this snippet does not compile there.
@Override
protected void processTextPosition(TextPosition text) {

    try {
        PDGraphicsState graphicsState = getGraphicsState();
        // Print the red, green and blue components of the current non-stroking colour.
        System.out.println("R = " + graphicsState.getNonStrokingColor().getJavaColor().getRed());
        System.out.println("G = " + graphicsState.getNonStrokingColor().getJavaColor().getGreen());
        System.out.println("B = " + graphicsState.getNonStrokingColor().getJavaColor().getBlue());
    }

    // NOTE(review): the IOException is silently discarded here.
    catch (IOException ioe) {}

}

When I try to use this, IntelliJ tells me "getJavaColor()" is undefined. I have also tried with this code:

/**
 * Second attempt: for every text position, prints the value that toRGB()
 * returns for the current non-stroking colour; an IOException is printed
 * instead of being rethrown.
 */
@Override
protected void processTextPosition(TextPosition text) {
    try {
        System.out.println("R = " + getGraphicsState().getNonStrokingColor().toRGB());
    } catch (IOException ioe) {
        System.out.println(ioe);
    }
}

And, while the method is getting called as expected, and the expected number of times, it always prints 0, even though in my PDF file I have black text and red text.

Here are my Maven dependencies:

<dependencies>

    <!-- All three PDFBox artifacts below are pinned to the same version, 2.0.17. -->

    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.17</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>fontbox</artifactId>
        <version>2.0.17</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox-tools -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox-tools</artifactId>
        <version>2.0.17</version>
    </dependency>

</dependencies>

Any help is appreciated


Answer:

Apparently in PDFBox 2.0.0+ versions you need to add these lines of code:

// Register the colour-related operator handlers, one addOperator call per handler.
// NOTE(review): presumably the text stripper does not register these itself, which
// would explain why the colour read from the graphics state never changed - confirm.

// Colour space selection (stroking / non-stroking).
addOperator(new SetStrokingColorSpace());
addOperator(new SetNonStrokingColorSpace());
// Device CMYK colours.
addOperator(new SetStrokingDeviceCMYKColor());
addOperator(new SetNonStrokingDeviceCMYKColor());
// Device RGB colours.
addOperator(new SetNonStrokingDeviceRGBColor());
addOperator(new SetStrokingDeviceRGBColor());
// Device gray colours.
addOperator(new SetNonStrokingDeviceGrayColor());
addOperator(new SetStrokingDeviceGrayColor());
// Colour setting in the currently selected colour space, plain and "N" variants.
addOperator(new SetStrokingColor());
addOperator(new SetStrokingColorN());
addOperator(new SetNonStrokingColor());
addOperator(new SetNonStrokingColorN());

to the constructor of your class that extends PDFTextStripper. Now if you use:

/**
 * Prints the value returned by toRGB() for the non-stroking colour in effect
 * at each text position, then passes the position on to the parent class so
 * that normal text extraction still takes place.
 *
 * @param textPosition the text position currently being processed
 */
@Override
protected void processTextPosition (TextPosition textPosition) {

    try {

        PDGraphicsState graphicsState = getGraphicsState();
        System.out.println(graphicsState.getNonStrokingColor().toRGB());

    }

    catch (Exception e) {
        // Best effort: a colour that cannot be converted must not abort extraction,
        // but the failure is reported instead of being silently swallowed.
        System.err.println("Could not determine text colour: " + e);
    }

    // Delegate to the parent implementation; without this call the overridden
    // method replaces the stripper's own handling of the text position.
    super.processTextPosition(textPosition);

}

it actually prints a real value.