PDF reading with python
This commit is contained in:
3
python/.idea/.gitignore
generated
vendored
Normal file
3
python/.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
6
python/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
python/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
7
python/.idea/misc.xml
generated
Normal file
7
python/.idea/misc.xml
generated
Normal file
@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (contabilidad)" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
</project>
|
8
python/.idea/modules.xml
generated
Normal file
8
python/.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/python.iml" filepath="$PROJECT_DIR$/.idea/python.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
8
python/.idea/python.iml
generated
Normal file
8
python/.idea/python.iml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.9 (contabilidad)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
6
python/.idea/vcs.xml
generated
Normal file
6
python/.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
|
||||
</component>
|
||||
</project>
|
18
python/Dockerfile
Normal file
18
python/Dockerfile
Normal file
@ -0,0 +1,18 @@
|
||||
# Image for the PDF-parsing Flask service started from src/app.py.
# NOTE(review): app.py reads /app/config/.passwords.yml and /app/data, neither
# of which is copied into the image below - presumably they are mounted at run
# time; confirm against the compose/run configuration.
FROM python

# Java runtime for tabula-py. The package index is removed in the same layer
# so it does not stay in the image.
RUN apt-get update -y \
    && apt-get install -y default-jre \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir flask tabula-py pyyaml pypdf4 gunicorn

WORKDIR /app

COPY ./src/ /app/src/

#ENTRYPOINT ["/bin/bash"]

EXPOSE 5000

WORKDIR /app/src

CMD ["python", "app.py"]
#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
|
3
python/config/.passwords.yml
Normal file
3
python/config/.passwords.yml
Normal file
@ -0,0 +1,3 @@
|
||||
passwords:
|
||||
- 0839
|
||||
- 159608395
|
BIN
python/data/BICE-CC-2021-09.pdf
Normal file
BIN
python/data/BICE-CC-2021-09.pdf
Normal file
Binary file not shown.
3
python/src/.idea/.gitignore
generated
vendored
Normal file
3
python/src/.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
12
python/src/.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
12
python/src/.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
@ -0,0 +1,12 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredIdentifiers">
|
||||
<list>
|
||||
<option value="property.tolist" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
6
python/src/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
python/src/.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
4
python/src/.idea/misc.xml
generated
Normal file
4
python/src/.idea/misc.xml
generated
Normal file
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (python)" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
python/src/.idea/modules.xml
generated
Normal file
8
python/src/.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/src.iml" filepath="$PROJECT_DIR$/.idea/src.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
8
python/src/.idea/src.iml
generated
Normal file
8
python/src/.idea/src.iml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.9 (python)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
6
python/src/.idea/vcs.xml
generated
Normal file
6
python/src/.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
|
||||
</component>
|
||||
</project>
|
BIN
python/src/__pycache__/app.cpython-310.pyc
Normal file
BIN
python/src/__pycache__/app.cpython-310.pyc
Normal file
Binary file not shown.
34
python/src/app.py
Normal file
34
python/src/app.py
Normal file
@ -0,0 +1,34 @@
|
||||
import json
|
||||
import os
|
||||
from flask import Flask, request
|
||||
|
||||
import contabilidad.pdf as pdf
|
||||
import contabilidad.passwords as passwords
|
||||
import contabilidad.log as log
|
||||
|
||||
|
||||
# Flask application object that the route decorator below registers on.
app = Flask(__name__)
# Point the contabilidad log helper at this service's own log file.
log.logging.update(filename='/var/log/python/contabilidad.log')
|
||||
|
||||
|
||||
@app.route('/pdf/parse', methods=['POST'])
def pdf_parse():
    """Extract the text of one or more PDFs stored under ``/app/data``.

    The request body is JSON of the form ``{"files": <entry or [entries]}``,
    where every entry is an object with a ``filename`` key naming a file
    relative to ``/app/data``. Each file is tried with the passwords from
    ``/app/config/.passwords.yml`` in order until one opens it.

    Returns:
        A JSON array with one element per readable file. Each element is
        itself the JSON-encoded text of that file (the double encoding is
        kept for compatibility with existing clients). Files that no
        password opens are omitted. A malformed body or a filename that
        resolves outside the data directory yields a 400 response.
    """
    data_dir = os.path.realpath('/app/data')
    password_file = '/app/config/.passwords.yml'

    data = request.get_json()
    if not isinstance(data, dict) or 'files' not in data:
        return json.dumps({'error': 'missing "files"'}), 400
    files = data['files']
    if not isinstance(files, list):
        files = [files]

    pwds = passwords.get_passwords(password_file)
    texts = []
    for file in files:
        filename = os.path.realpath(os.path.join(data_dir, file['filename']))
        # The filename comes from the client: after realpath() has resolved
        # '..' components and symlinks, refuse anything outside data_dir.
        if os.path.commonpath([data_dir, filename]) != data_dir:
            return json.dumps({'error': 'invalid filename'}), 400
        for p in pwds:
            obj = pdf.get_text(filename, p)
            if obj is None:
                continue
            texts.append(json.dumps(obj))
            # Stop at the first password that works; otherwise a file that
            # opens with several passwords (e.g. an unencrypted one) would be
            # appended once per password.
            break
    return json.dumps(texts)
|
||||
|
||||
|
||||
# Run Flask's built-in server when the module is executed directly.
if '__main__' == __name__:
    # Bind to every interface, not just the loopback address.
    app.run(host='0.0.0.0')
|
BIN
python/src/contabilidad/__pycache__/log.cpython-310.pyc
Normal file
BIN
python/src/contabilidad/__pycache__/log.cpython-310.pyc
Normal file
Binary file not shown.
BIN
python/src/contabilidad/__pycache__/log.cpython-39.pyc
Normal file
BIN
python/src/contabilidad/__pycache__/log.cpython-39.pyc
Normal file
Binary file not shown.
BIN
python/src/contabilidad/__pycache__/passwords.cpython-310.pyc
Normal file
BIN
python/src/contabilidad/__pycache__/passwords.cpython-310.pyc
Normal file
Binary file not shown.
BIN
python/src/contabilidad/__pycache__/passwords.cpython-39.pyc
Normal file
BIN
python/src/contabilidad/__pycache__/passwords.cpython-39.pyc
Normal file
Binary file not shown.
BIN
python/src/contabilidad/__pycache__/pdf.cpython-310.pyc
Normal file
BIN
python/src/contabilidad/__pycache__/pdf.cpython-310.pyc
Normal file
Binary file not shown.
BIN
python/src/contabilidad/__pycache__/pdf.cpython-39.pyc
Normal file
BIN
python/src/contabilidad/__pycache__/pdf.cpython-39.pyc
Normal file
Binary file not shown.
19
python/src/contabilidad/log.py
Normal file
19
python/src/contabilidad/log.py
Normal file
@ -0,0 +1,19 @@
|
||||
import time
|
||||
|
||||
|
||||
# Mutable module-level settings. Callers change the destination by assigning
# to logging['filename'] before calling log().
logging = {
    'filename': '/var/log/python/error.log',
}


class LOG_LEVEL:
    """Severity labels that are written verbatim into each log entry."""

    INFO = 'INFO'
    WARNING = 'WARNING'
    DEBUG = 'DEBUG'
    ERROR = 'ERROR'


def log(message, level=LOG_LEVEL.INFO):
    """Append one timestamped entry to the file named by ``logging['filename']``.

    Args:
        message: Text of the entry.
        level: Severity label; one of the ``LOG_LEVEL`` constants.

    Raises:
        OSError: If the log file cannot be opened for appending.
    """
    entry = time.strftime('[%Y-%m-%d %H:%M:%S] ') + ' - ' + level + ': ' + message
    # Terminate every entry so consecutive calls do not run together on a
    # single line; messages that already end in a newline are left alone.
    if not entry.endswith('\n'):
        entry += '\n'
    with open(logging['filename'], 'a') as f:
        f.write(entry)
|
6
python/src/contabilidad/passwords.py
Normal file
6
python/src/contabilidad/passwords.py
Normal file
@ -0,0 +1,6 @@
|
||||
import yaml
|
||||
|
||||
|
||||
def get_passwords(filename):
    """Load the candidate PDF passwords from a YAML file.

    The file must contain a top-level ``passwords`` key holding a list.

    Args:
        filename: Path of the YAML file.

    Returns:
        The passwords as a list of strings, in file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the ``passwords`` key is missing.
    """
    with open(filename, 'r') as f:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects.
        # That is acceptable only while this file is trusted local
        # configuration; consider yaml.safe_load.
        config = yaml.load(f, Loader=yaml.Loader)
    # Unquoted all-digit entries (e.g. ``- 159608395``) are parsed by YAML as
    # integers. Passwords are text, so normalise every entry to str.
    return [str(password) for password in config['passwords']]
|
31
python/src/contabilidad/pdf.py
Normal file
31
python/src/contabilidad/pdf.py
Normal file
@ -0,0 +1,31 @@
|
||||
import PyPDF4
|
||||
import tabula
|
||||
|
||||
|
||||
def get_pdf(file, password=''):
    """Open *file* with PyPDF4, unlocking it with *password* when needed.

    Decryption is attempted only when the document reports itself as
    encrypted and a non-empty password was supplied.

    Returns:
        The ``PdfFileReader``, or ``None`` when ``decrypt`` reports failure
        (status 0).
    """
    pdf_reader = PyPDF4.PdfFileReader(file)
    needs_unlock = pdf_reader.getIsEncrypted() and password != ''
    if not needs_unlock:
        return pdf_reader
    if pdf_reader.decrypt(password=password) == 0:
        return None
    return pdf_reader
|
||||
|
||||
|
||||
def get_text(filename, password=''):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        filename: Path of the PDF file.
        password: Password to try if the document is encrypted.

    Returns:
        The extracted text, or ``None`` when ``get_pdf`` could not decrypt
        the document with *password*.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(filename, 'rb') as f:
        reader = get_pdf(f, password)
        if reader is None:
            return None
        # Extraction stays inside the ``with`` block because the reader was
        # built on ``f``. The former debug prints are gone: they extracted
        # page 0 a second time and dumped document text to stdout.
        return "\n".join(
            reader.getPage(p).extractText()
            for p in range(reader.getNumPages())
        )
|
||||
|
||||
|
||||
def get_data(filename, password=''):
    """Read the tables of all pages of a PDF with tabula, as JSON output.

    The ``password`` argument is forwarded to ``tabula.read_pdf`` only when
    it is non-empty.
    """
    options = {'pages': 'all', 'output_format': 'json'}
    if password != '':
        options['password'] = password
    return tabula.read_pdf(filename, **options)
|
54
python/src/contabilidad/send_pdf.py
Normal file
54
python/src/contabilidad/send_pdf.py
Normal file
@ -0,0 +1,54 @@
|
||||
import argparse
|
||||
import yaml
|
||||
import PyPDF4
|
||||
import httpx
|
||||
|
||||
|
||||
def get_pdf(file, password=''):
    """Open *file* with PyPDF4 and try to unlock it with *password*.

    Args:
        file: Binary file object positioned at the start of a PDF.
        password: Password to try; the empty string means "none".

    Returns:
        The ``PdfFileReader``. It is returned even when decryption fails;
        in that case ``'Not decrypted'`` is printed.
    """
    reader = PyPDF4.PdfFileReader(file)
    # Decrypt only documents that are actually encrypted, the same guard as
    # contabilidad.pdf.get_pdf, so a configured password is not applied to a
    # file with no encryption.
    if reader.getIsEncrypted() and password != '':
        status = reader.decrypt(password=password)
        if status == 0:
            print('Not decrypted')
    return reader
|
||||
|
||||
|
||||
def send_to_parser(url, text):
    """POST *text* to *url* as the form field ``to_parse``.

    Returns:
        A dict with the HTTP status code under ``'status'`` and the decoded
        JSON response body under ``'text'``.
    """
    response = httpx.post(url, data={'to_parse': text})
    status_code = response.status_code
    payload = response.json()
    return {'status': status_code, 'text': payload}
|
||||
|
||||
|
||||
def get_text(filename, password=''):
    """Return the text of every page of the PDF at *filename*, newline-joined."""
    with open(filename, 'rb') as pdf_file:
        pdf_reader = get_pdf(pdf_file, password)
        page_texts = [
            pdf_reader.getPage(index).extractText()
            for index in range(pdf_reader.getNumPages())
        ]
    return "\n".join(page_texts)
|
||||
|
||||
|
||||
def get_config(filename):
    """Parse the YAML file at *filename* and return its content."""
    with open(filename, 'r') as config_file:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # only use this on trusted configuration files.
        settings = yaml.load(config_file, Loader=yaml.Loader)
    return settings
|
||||
|
||||
|
||||
def main(args):
    """Extract the text of ``args.filename`` and post it to ``args.url``.

    The password comes from the ``password`` key of ``args.config_file``
    when a config file is given; ``args.password`` overrides it when set.
    The parser's response is printed.
    """
    if args.config_file is None:
        password = ''
    else:
        password = get_config(args.config_file)['password']
    if args.password is not None:
        password = args.password
    print(send_to_parser(args.url, get_text(args.filename, password)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point. --filename and --url are declared required
    # because main() cannot do anything useful without them; omitting them
    # now produces a usage error instead of a failure deep inside main().
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', type=str, required=True,
                        help='path of the PDF to read')
    parser.add_argument('-p', '--password', type=str,
                        help='PDF password (overrides the config file)')
    parser.add_argument('-c', '--config_file', type=str,
                        help='YAML file providing a "password" key')
    parser.add_argument('-u', '--url', type=str, required=True,
                        help='URL of the parser endpoint to POST the text to')
    _args = parser.parse_args()
    main(_args)
|
3
python/src/contabilidad/text_handler.py
Normal file
3
python/src/contabilidad/text_handler.py
Normal file
@ -0,0 +1,3 @@
|
||||
def text_cleanup(text):
    """Split *text* on newlines and print the resulting list.

    Nothing is returned; the function currently only prints.
    """
    print(text.split("\n"))
|
18
python/src/main.py
Normal file
18
python/src/main.py
Normal file
@ -0,0 +1,18 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import contabilidad.pdf as pdf
|
||||
|
||||
|
||||
def main(args):
    """Print the text of ``args.filename``, looked up in the sibling ``data`` directory.

    The path is resolved relative to this file as ``../data/<filename>`` and
    opened with ``args.password``.
    """
    data_dir = os.path.join(os.path.dirname(__file__), '..', 'data')
    pdf_path = os.path.realpath(os.path.join(data_dir, args.filename))
    print(pdf.get_text(pdf_path, args.password))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point. --filename is declared required because
    # main() joins it onto the data directory; omitting it now produces a
    # usage error instead of a TypeError inside os.path.join().
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', type=str, required=True,
                        help='PDF file name inside the data directory')
    parser.add_argument('-p', '--password', type=str, default='',
                        help='PDF password, if the file is encrypted')
    _args = parser.parse_args()
    main(_args)
|
35
python/src/tests/Testing.md
Normal file
35
python/src/tests/Testing.md
Normal file
@ -0,0 +1,35 @@
|
||||
# Tests
|
||||
|
||||
### 1. Conductor
|
||||
+ Set start event
|
||||
+ Get ready events
|
||||
+ Set first step event
|
||||
+ Get first step ready
|
||||
+ Set second step event
|
||||
+ Get second step ready
|
||||
|
||||
### 2. Email
|
||||
+ Connect to IMAP
|
||||
+ Wrong data
|
||||
+ Wrong configuration
|
||||
+ Get mailboxes
|
||||
+ Get mail ids with search
|
||||
+ Get mails by id
|
||||
+ Get mail by id
|
||||
+ Get attachment
|
||||
+ Close connection
|
||||
|
||||
### 3. API Sender
|
||||
+ Get attachments
|
||||
+ Process
|
||||
+ Send to API
|
||||
|
||||
|
||||
## Steps
|
||||
1. Start
|
||||
+ Connect
|
||||
+ Standby
|
||||
2. Find emails, get attachments
|
||||
3. Process attachments
|
||||
4. Send to API
|
||||
5. Close
|
Reference in New Issue
Block a user