Experiment with using pdftotext.

This commit is contained in:
Andrew Ferrier 2015-02-10 15:58:10 +00:00
parent 0da30113f3
commit d43781f247
2 changed files with 7 additions and 12 deletions

View file

@ -30,4 +30,5 @@ COPY . /tmp/email2pdf/
# Place the getmail file from the build context into /etc/cron.d/.
COPY docker/email2pdf/getmail /etc/cron.d/
WORKDIR /tmp/email2pdf
# Build the .deb, install every .deb found via gdebi, and keep a copy in /tmp.
RUN make builddeb_real && sh -c 'ls -1 /tmp/email2pdf/*.deb | xargs -L 1 gdebi -n' && cp /tmp/email2pdf/*.deb /tmp
# xpdf-utils is installed for the pdftotext tool that the unit tests shell out to.
# Refresh the package index in the same layer as the install so the step does
# not depend on a (possibly stale) index cached by an earlier layer, and clean
# up in that same layer so the downloaded lists do not persist in the image.
RUN apt-get update \
    && apt-get install -y xpdf-utils \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /var/tmp/*

View file

@ -369,18 +369,12 @@ class Email2PDFTestCase(unittest.TestCase):
return None
def getPDFText(self, filename):
    """Extract the text of a PDF by running the external ``pdftotext`` tool.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text as a ``str``, or ``None`` when ``pdftotext``
        exits with a non-zero status (for example because the file could
        not be opened as a PDF).
    """
    with tempfile.NamedTemporaryFile() as temporaryFile:
        # Request UTF-8 output explicitly so the decode below does not
        # depend on the tool's default text encoding.
        options = ['pdftotext', '-enc', 'UTF-8', filename, temporaryFile.name]
        # A non-zero exit status means no trustworthy output was produced;
        # report that as None rather than returning whatever is in the file.
        if Popen(options).wait() != 0:
            return None
        # Re-open by name: pdftotext wrote to the path, not to our handle.
        with open(temporaryFile.name, 'rb') as f:
            return str(f.read(), 'utf-8')
def touch(self, fname):
    """Create ``fname`` as an empty file, truncating it if it already exists."""
    # Opening in write mode creates or empties the file; leaving the
    # ``with`` block closes the handle straight away.
    with open(fname, 'w'):
        pass