I have a large pdf where I need to pull all the instances of ID#. I currently have it setup where I can parse the first page correctly but ignores ever other section in the document. Also I have noticed is that there is an issue because not all pages have the ID on it so I would need to ignore the pages that do not contain the ID.
raw:
Case ID#: 45325234523 Date: 12/09/2019
----------------Page (0) Break---------------- ----------------Page (1) Break---------------- ----------------Page (2) Break---------------- ----------------Page (3) Break---------------- � Case ID#: 2424123421 Date: 12/09/2019 ----------------Page (4) Break---------------- ----------------Page (5) Break---------------- ----------------Page (6) Break---------------- ----------------Page (7) Break---------------- 3 Case ID #: 303022123 Date: 12/09/2019 ----------------Page (8) Break---------------- ----------------Page (9) Break---------------- ----------------Page (10) Break---------------- ----------------Page (11) Break---------------- � 4 Case ID#: 3721629345 Date: 12/09/2019 ----------------Page (12) Break---------------- ----------------Page (13) Break---------------- ----------------Page (14) Break---------------- ----------------Page (15) Break----------------
output: [ {Case: '45325234523'} ]
code:
const fs = require("fs");
const PDFParser = require("pdf2json");
// Get all the filenames from the Test folder
const files = fs.readdirSync("Test");
// All of the parse Test
let Test = [];
// Make a IIFE so we can run asynchronous code
(async () => {
// Await all of the Test to be passed
// For each file in the Test folder
await Promise.all(files.map(async (file) => {
// Set up the pdf parser
let pdfParser = new PDFParser(this, 1);
// Load the pdf document
pdfParser.loadPDF(`Test/${file}`);
// Parsed the patient
let output = await new Promise(async (resolve, reject) => {
// On data ready
pdfParser.on("pdfParser_dataReady", (pdfData) => {
// The raw PDF data in text form
const raw = pdfParser.getRawTextContent().replace(/\r\n/g, "");
//console.log(raw);
// Return the parsed data
resolve({
Case: /ID#:\s(.*?)Date/i.exec(raw)[1].trim()
});
});
});
// Add the patient to the Test array
Test.push(output);
}));
// Save the extracted information to a json file
//fs.writeFileSync("Test.json", JSON.stringify(Test));
console.log(Test);
})();