PDF reading with python

This commit is contained in:
2021-11-01 11:00:59 -03:00
parent 9f301e2175
commit 5ee267568a
74 changed files with 1092 additions and 26 deletions

3
python/.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
python/.idea/misc.xml generated Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (contabilidad)" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>
</project>

8
python/.idea/modules.xml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/python.iml" filepath="$PROJECT_DIR$/.idea/python.iml" />
</modules>
</component>
</project>

8
python/.idea/python.iml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.9 (contabilidad)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
python/.idea/vcs.xml generated Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
</component>
</project>

18
python/Dockerfile Normal file
View File

@ -0,0 +1,18 @@
# Image for the PDF-reading Flask service.
FROM python
# Java runtime; presumably needed by tabula-py, which is installed below.
RUN apt-get update -y && apt-get install -y default-jre
# Python dependencies of the service (web framework, PDF readers, YAML, WSGI server).
RUN pip install flask tabula-py pyyaml pypdf4 gunicorn
WORKDIR /app
# Only the application sources are baked into the image.
COPY ./src/ /app/src/
#ENTRYPOINT ["/bin/bash"]
# Port 5000 is exposed for the application started by CMD below.
EXPOSE 5000
WORKDIR /app/src
# Start the app through its own __main__ entry point.
CMD ["python", "app.py"]
#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]

View File

@ -0,0 +1,3 @@
passwords:
- 0839
- 159608395

Binary file not shown.

3
python/src/.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -0,0 +1,12 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="property.tolist" />
</list>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
python/src/.idea/misc.xml generated Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (python)" project-jdk-type="Python SDK" />
</project>

8
python/src/.idea/modules.xml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/src.iml" filepath="$PROJECT_DIR$/.idea/src.iml" />
</modules>
</component>
</project>

8
python/src/.idea/src.iml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.9 (python)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
python/src/.idea/vcs.xml generated Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
</component>
</project>

Binary file not shown.

34
python/src/app.py Normal file
View File

@ -0,0 +1,34 @@
import json
import os
from flask import Flask, request
import contabilidad.pdf as pdf
import contabilidad.passwords as passwords
import contabilidad.log as log
# Flask application object; the route decorator below registers on it.
app = Flask(__name__)
# Override the 'filename' entry of the log module's settings dict so that
# this service writes to its own log file.
log.logging['filename'] = '/var/log/python/contabilidad.log'
@app.route('/pdf/parse', methods=['POST'])
def pdf_parse():
    """Extract the text of one or more PDFs stored under ``/app/data``.

    Expects a JSON body of the form ``{"files": [{"filename": ...}, ...]}``;
    a single file object is accepted in place of the list.  Each password
    listed in ``/app/config/.passwords.yml`` is tried against each file
    until one of them yields text.

    Returns:
        A JSON array (serialized as a string) whose items are themselves
        JSON-encoded strings, one per file that could be read.  Files that
        no password opens are omitted.  A malformed body, or a filename that
        resolves outside the data directory, produces a 400 response with a
        JSON ``{"error": ...}`` body.
    """
    data_root = os.path.realpath('/app/data')

    # silent=True returns None instead of raising on an unparsable body, so
    # every malformed request is rejected through the same branch.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'files' not in data:
        return json.dumps({'error': 'expected a JSON object with a "files" entry'}), 400

    files = data['files']
    if not isinstance(files, list):
        files = [files]

    # Validate every entry before doing any work.
    filenames = []
    for file in files:
        if not isinstance(file, dict) or not isinstance(file.get('filename'), str):
            return json.dumps({'error': 'each file needs a "filename" string'}), 400
        filename = os.path.realpath(os.path.join(data_root, file['filename']))
        # The filename comes from the client: refuse anything that resolves
        # outside the data directory (e.g. "../config/.passwords.yml").
        if os.path.commonpath([data_root, filename]) != data_root:
            return json.dumps({'error': 'filename is outside the data directory'}), 400
        filenames.append(filename)

    password_file = '/app/config/.passwords.yml'
    pwds = passwords.get_passwords(password_file)

    texts = []
    for filename in filenames:
        for p in pwds:
            # YAML parses unquoted numeric passwords as int; the PDF layer
            # compares and decrypts with strings.
            obj = pdf.get_text(filename, str(p))
            if obj is None:
                continue
            texts.append(json.dumps(obj))
            # Stop at the first password that works; otherwise a PDF that
            # opens with several passwords (or is not encrypted at all) would
            # be appended once per password.
            break
    return json.dumps(texts)
# Direct-execution entry point (python app.py).
if __name__ == '__main__':
    # host='0.0.0.0' makes the server listen on all interfaces, not just localhost.
    app.run(host='0.0.0.0')

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,19 @@
import time
# Module-level logger settings. log() reads 'filename' on every call, so
# importers can redirect the output by assigning to this entry.
logging = {
    'filename': '/var/log/python/error.log'
}
class LOG_LEVEL:
    """Plain string labels for the severity of a log entry."""

    # Each label's value is its own name, listed from least to most severe.
    DEBUG, INFO, WARNING, ERROR = 'DEBUG', 'INFO', 'WARNING', 'ERROR'
def log(message, level=LOG_LEVEL.INFO):
    """Append one timestamped entry to the configured log file.

    Args:
        message: Text of the entry; non-string values are converted with
            ``str()``.
        level: Severity label written in front of the message (one of the
            ``LOG_LEVEL`` strings).

    The target path is read from ``logging['filename']`` at call time.
    """
    timestamp = time.strftime('[%Y-%m-%d %H:%M:%S] ')
    entry = timestamp + ' - ' + level + ': ' + str(message)
    # One entry per line: without the terminator every record ran into the
    # previous one. Messages that already end in a newline are left alone.
    if not entry.endswith('\n'):
        entry += '\n'
    with open(logging['filename'], 'a') as f:
        f.write(entry)

View File

@ -0,0 +1,6 @@
import yaml
def get_passwords(filename):
    """Return the candidate passwords listed in a YAML file.

    Args:
        filename: Path of a YAML document with a top-level ``passwords`` list.

    Returns:
        A list of strings, in file order.  An empty ``passwords:`` key gives
        an empty list.

    Raises:
        KeyError: If the document has no ``passwords`` key.
    """
    with open(filename, 'r') as f:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # this is only acceptable while the file is trusted local
        # configuration. Consider yaml.safe_load.
        config = yaml.load(f, Loader=yaml.Loader)
    # YAML turns unquoted digit-only entries (e.g. 159608395) into int, but
    # they are used as text passwords downstream; normalize them to str here.
    # (Quote entries with leading zeros in the file so YAML keeps them intact.)
    return [str(p) for p in (config['passwords'] or [])]

View File

@ -0,0 +1,31 @@
import PyPDF4
import tabula
def get_pdf(file, password=''):
    """Open *file* with PyPDF4, decrypting it when a password is supplied.

    Args:
        file: Binary file object positioned at the start of a PDF.
        password: Password to try; the empty string means "do not decrypt".

    Returns:
        The ``PdfFileReader``, or ``None`` when the document is encrypted and
        ``decrypt`` reports failure (status 0) for the given password.
    """
    reader = PyPDF4.PdfFileReader(file)
    must_decrypt = reader.getIsEncrypted() and password != ''
    if not must_decrypt:
        return reader
    if reader.decrypt(password=password) == 0:
        return None
    return reader
def get_text(filename, password=''):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        filename: Path of the PDF file.
        password: Password forwarded to ``get_pdf``.

    Returns:
        The concatenated page texts, or ``None`` when ``get_pdf`` could not
        decrypt the document with *password*.
    """
    with open(filename, 'rb') as f:
        reader = get_pdf(f, password)
        if reader is None:
            return None
        # The leftover debug prints were dropped: they extracted page 0 a
        # second time and raised on a document without pages. Extraction
        # stays inside the `with` block because the reader was built on the
        # open handle.
        return "\n".join(
            reader.getPage(p).extractText()
            for p in range(reader.getNumPages())
        )
def get_data(filename, password=''):
    """Read the tables of all pages of a PDF through tabula, as JSON.

    Args:
        filename: Path of the PDF file.
        password: Document password; it is only forwarded to tabula when it
            is not the empty string.

    Returns:
        Whatever ``tabula.read_pdf`` returns for ``output_format='json'``.
    """
    options = {'pages': 'all', 'output_format': 'json'}
    if password != '':
        options['password'] = password
    return tabula.read_pdf(filename, **options)

View File

@ -0,0 +1,54 @@
import argparse
import yaml
import PyPDF4
import httpx
def get_pdf(file, password=''):
    """Open *file* with PyPDF4, decrypting it when needed and possible.

    Args:
        file: Binary file object positioned at the start of a PDF.
        password: Password to try; the empty string means "do not decrypt".

    Returns:
        The ``PdfFileReader``.  A failed decryption is only reported on
        stdout ('Not decrypted'); the reader is returned regardless.
    """
    reader = PyPDF4.PdfFileReader(file)
    # Only attempt decryption when the document is actually encrypted: a
    # password supplied for a plain PDF must not make the call fail.
    if reader.getIsEncrypted() and password != '':
        status = reader.decrypt(password=password)
        if status == 0:
            print('Not decrypted')
    return reader
def send_to_parser(url, text):
    """POST *text* to the parser service and summarize its answer.

    Args:
        url: Endpoint of the parser service.
        text: Text to send, as the form field ``to_parse``.

    Returns:
        A dict with the HTTP status code under ``'status'`` and the decoded
        JSON body under ``'text'``.
    """
    form = {'to_parse': text}
    response = httpx.post(url, data=form)
    status_code = response.status_code
    body = response.json()
    return {'status': status_code, 'text': body}
def get_text(filename, password=''):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        filename: Path of the PDF file.
        password: Password forwarded to ``get_pdf``.
    """
    with open(filename, 'rb') as stream:
        document = get_pdf(stream, password)
        page_count = document.getNumPages()
        pages = [document.getPage(index).extractText() for index in range(page_count)]
        return "\n".join(pages)
def get_config(filename):
    """Load a YAML configuration file and return its parsed content.

    Args:
        filename: Path of the YAML document.
    """
    with open(filename, 'r') as stream:
        # NOTE(review): yaml.Loader is the full (unsafe) loader; fine only for
        # trusted local files.
        parsed = yaml.load(stream, Loader=yaml.Loader)
    return parsed
def main(args):
    """Extract the text of a PDF and send it to the parser service.

    Args:
        args: Parsed command-line namespace with ``filename``, ``password``,
            ``config_file`` and ``url``.

    The password comes from the config file's ``password`` entry when a
    config file is given; an explicit ``--password`` takes precedence over
    it. The service's answer is printed.
    """
    password = ''
    if args.config_file is not None:
        password = get_config(args.config_file)['password']
    if args.password is not None:
        # The command line wins over the config file.
        password = args.password
    extracted = get_text(args.filename, password)
    print(send_to_parser(args.url, extracted))
# Command-line entry point: parse the options and run main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # main() uses the file name and the URL unconditionally, so argparse is
    # told to reject a call without them instead of letting it fail later
    # with a TypeError.
    parser.add_argument('-f', '--filename', type=str, required=True)
    parser.add_argument('-p', '--password', type=str)
    parser.add_argument('-c', '--config_file', type=str)
    parser.add_argument('-u', '--url', type=str, required=True)
    _args = parser.parse_args()
    main(_args)

View File

@ -0,0 +1,3 @@
def text_cleanup(text):
    """Split *text* on newlines and print the resulting list.

    Nothing is returned; the list of lines only goes to stdout.
    """
    print(text.split("\n"))

18
python/src/main.py Normal file
View File

@ -0,0 +1,18 @@
import argparse
import os
import contabilidad.pdf as pdf
def main(args):
    """Print the text extracted from a PDF in the sibling ``data`` directory.

    Args:
        args: Parsed command-line namespace with ``filename`` (relative to
            ``../data`` from this file's directory) and ``password``.
    """
    here = os.path.dirname(__file__)
    candidate = os.path.join(here, '..', 'data', args.filename)
    filename = os.path.realpath(candidate)
    print(pdf.get_text(filename, args.password))
# Command-line entry point: parse the options and run main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # main() joins the file name into a path unconditionally, so argparse is
    # told to reject a call without it instead of failing with a TypeError.
    parser.add_argument('-f', '--filename', type=str, required=True)
    parser.add_argument('-p', '--password', type=str, default='')
    _args = parser.parse_args()
    main(_args)

View File

@ -0,0 +1,35 @@
# Tests
### 1. Conductor
+ Set start event
+ Get ready events
+ Set first step event
+ Get first step ready
+ Set second step event
+ Get second step ready
### 2. Email
+ Connect to IMAP
+ Wrong data
+ Wrong configuration
+ Get mailboxes
+ Get mail ids with search
+ Get mails by id
+ Get mail by id
+ Get attachment
+ Close connection
### 3. API Sender
+ Get attachments
+ Process
+ Send to API
## Steps
1. Start
+ Connect
+ Standby
2. Find emails, get attachments
3. Process attachments
4. Send to API
5. Close