Skip to content

Tabula-java is skipping one row from Table #544

@sikanderrafiq

Description

@sikanderrafiq

HI,
I am using Tabula-java to parse a table in a PDF file, but it skips every other row: the alternate rows (1, 3, 5, ...) are extracted properly, while the rows in between are missing from the output.
I have attached my pdf file named murree_ren.pdf.
murree_ren.pdf

This is the code I have used:

/**
 * Loads the PDF at the hard-coded path {@code D:/Pdfs/murree_ren.pdf}, runs
 * {@code SpreadsheetExtractionAlgorithm} on every page and prints each
 * detected table to standard output, one line per row with the cell texts
 * joined by {@code '|'}.
 *
 * <p>The input stream, the document and the extractor are opened in a
 * try-with-resources statement so they are released even when extraction
 * fails part-way through. Any exception is reported (message on standard
 * output, full stack trace on standard error) and not rethrown.
 */
public void parse() {

	System.out.println("TabulaPdfParser.parse-----------------------------------");

	File file = new File("D:/Pdfs/murree_ren.pdf");

	// Resources are closed in reverse declaration order: extractor, document, stream.
	// NOTE(review): assumes ObjectExtractor is Closeable (the original code
	// suppressed a "resource" warning on it) - confirm for the tabula-java version in use.
	try (FileInputStream inputStream = new FileInputStream(file);
			PDDocument document = PDDocument.load(inputStream);
			ObjectExtractor extractor = new ObjectExtractor(document)) {

		System.out.println("TabulaPdfParser.parse--------------------document loaded---------------");

		SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
		PageIterator pi = extractor.extract();

		while (pi.hasNext()) {
			// iterate over the pages of the document
			Page page = pi.next();
			List<Table> tables = sea.extract(page);
			System.out.println("TabulaPdfParser.parse------------------||||-----------------table size=" + tables.size());

			// iterate over the tables of the page
			for (Table table : tables) {
				int rowcount = table.getRowCount();
				int colcount = table.getColCount();
				System.out.println("TabulaPdfParser.parse------------getRowCount=" + rowcount + " colcount=" + colcount);

				for (int i = 0; i < rowcount; i++) {
					// One builder per row instead of repeated String concatenation.
					StringBuilder str = new StringBuilder();
					for (int j = 0; j < colcount; j++) {
						// Carriage returns inside a cell are flattened to spaces
						// so each table row stays on a single output line.
						str.append(table.getCell(i, j).getText().replace("\r", " "));
						if (j < (colcount - 1)) {
							str.append('|');
						}
					}
					System.out.println("RowText:----------row no=" + i + " str=" + str);
				}
			}
		}

	} catch (Exception ex) {
		System.out.println("Exception:---------------------------------------=" + ex.getMessage());
		// getMessage() alone can be null or ambiguous; keep the exception type and stack trace.
		ex.printStackTrace();
	}

}

Here is the output:

TabulaPdfParser.parse-----------------------------------
TabulaPdfParser.parse--------------------document loaded---------------
TabulaPdfParser.parse------------------||||-----------------table size=109
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=7
RowText:----------row no=0 str=1|39167 24/11/2019|MUHAMMAD MATLOOB|MUHAMMAD TAJ|VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB|6/6/1976|F.A
RowText:----------row no=1 str=||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=7
RowText:----------row no=0 str=3|455 18/06/2020|WAHEED ANWAR|ABDUL QADOUS|P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB|12/9/1954|MATRIC
RowText:----------row no=1 str=||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=WAHEED ANWAR|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=ABDUL QADOUS
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=5
RowText:----------row no=0 str=||||
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=3 colcount=11
RowText:----------row no=0 str=5|61134 2020-12-08|AZRA PARVEEN|MUHAMMAD TALIB|V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|30/10/1966|MATRIC||||
RowText:----------row no=1 str=||||||||||
RowText:----------row no=2 str=||||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=61134 2020-12-08
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=61134 2020-12-08
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=AZRA PARVEEN|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=3 colcount=8
RowText:----------row no=0 str=7|60305 25/01/2021|AZRA NAHEED|MANZOOR HUSSAIN|BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|17/03/1973|MATRIC|
RowText:----------row no=1 str=|||||||
RowText:----------row no=2 str=|||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MANZOOR HUSSAIN
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MANZOOR HUSSAIN
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=3 colcount=9
RowText:----------row no=0 str=9|59463 31/07/2021|MUNAWAR HUSSAIN|MUHAMMAD ABDULLAH|H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|01/03/1974|MATRIC||
RowText:----------row no=1 str=||||||||
RowText:----------row no=2 str=||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MUNAWAR HUSSAIN
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MUHAMMAD ABDULLAH
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=4
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|||
RowText:----------row no=1 str=|||
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=3 colcount=11
RowText:----------row no=0 str=11|58306 10/01/2022|MUBASHAR ISHAQ QAMAR|MUHAMMAD ISHAQ|VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|26/02/1992|FSC||||
RowText:----------row no=1 str=||||||||||
RowText:----------row no=2 str=||||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MUBASHAR ISHAQ QAMAR
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=4
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|||
RowText:----------row no=1 str=|||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=FSC
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=3 colcount=12
RowText:----------row no=0 str=13|49597 19/06/2022|TAHIR MEHBOOB|MUHAMMAD MEHBOOB|GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB|15/7/1988|ICS , FSC HOMEO|||||
RowText:----------row no=1 str=|||||||||||
RowText:----------row no=2 str=|||||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=TAHIR MEHBOOB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=TAHIR MEHBOOB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=TAHIR MEHBOOB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=4
RowText:----------row no=0 str=MUHAMMAD MEHBOOB|||
RowText:----------row no=1 str=|||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=ICS , FSC HOMEO
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=3 colcount=11
RowText:----------row no=0 str=15|53404 19/03/2023|MUHAMMAD AJMAL MALIK|MAHMOOD AHMED MALIK|VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB|26/2/1961|MATRIC||||
RowText:----------row no=1 str=||||||||||
RowText:----------row no=2 str=||||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=MUHAMMAD AJMAL MALIK||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MAHMOOD AHMED MALIK
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions