diff --git a/python/Dockerfile b/python/Dockerfile
index 1eedf44..96ee6c3 100644
--- a/python/Dockerfile
+++ b/python/Dockerfile
@@ -2,7 +2,7 @@ FROM python
 RUN apt-get update -y && apt-get install -y ghostscript python3-tk libgl-dev
-RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf
+RUN pip install flask pyyaml pypdf4 gunicorn camelot-py[cv] pikepdf httpx
 WORKDIR /app
@@ -12,4 +12,5 @@ EXPOSE 5000
 WORKDIR /app/src
-CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
+CMD ["python", "app.py"]
+#CMD ["gunicorn", "-b 0.0.0.0:5000", "app:app"]
diff --git a/python/config/.passwords.yml b/python/config/.passwords.yml
index f44e275..1acd267 100644
--- a/python/config/.passwords.yml
+++ b/python/config/.passwords.yml
@@ -1,3 +1,4 @@
 passwords:
   - 0839
   - 159608395
+  - 15960839
diff --git a/python/data/EECCvirtual-Visa.pdf b/python/data/EECCvirtual-Visa.pdf
new file mode 100644
index 0000000..7ea5ef8
Binary files /dev/null and b/python/data/EECCvirtual-Visa.pdf differ
diff --git a/python/src/ai/dictionary.py b/python/src/ai/dictionary.py
new file mode 100644
index 0000000..c42caef
--- /dev/null
+++ b/python/src/ai/dictionary.py
@@ -0,0 +1,285 @@
+import json
+import os
+
+import enlighten
+from sklearn.preprocessing import LabelEncoder
+
+import src.contabilidad.pdf as pdf
+import src.contabilidad.text_handler as th
+from src.ai.models import Phrase, phrase_factory, Word, word_factory
+from src.contabilidad.log import LOG_LEVEL
+
+
+class Dictionary:
+    def __init__(self, filename, logger):
+        self.filename = filename
+        self._logger = logger
+        self.__processed = []
+        self.__phrases = None
+        self.__words = None
+        self.load()
+
+    def load(self):
+        if not os.path.isfile(self.filename):
+            return
+        with open(self.filename, 'r') as file:
+            data = json.load(file)
+            if 'words' in data:
+                self.__words = [word_factory(w) for w in data['words']]
+            if 'phrases' in data:
+                # NOTE: phrases loaded from disk hold plain word ids; the
+                # in-memory pipeline below stores Word objects.
+                self.__phrases = [phrase_factory(ph) for ph in data['phrases']]
+            if 'processed' in data:
+                self.__processed = data['processed']
+
+    def save(self):
+        self.sort_words()
+        self.sort_phrases()
+        with open(self.filename, 'w') as file:
+            json.dump(self.to_json(), file, indent=2)
+
+    def to_data(self):
+        # Label-encode the unique words and store each word's numeric fit.
+        encoder = LabelEncoder()
+        words = self.get_words()
+        data = encoder.fit_transform([w.get_word() for w in words])
+        for word, fit in zip(words, data):
+            word.set_fit(int(fit))
+        return data
+        # return [ph.to_data() for ph in self.get_phrases()]
+
+    def to_json(self):
+        output = {
+            'processed': [],
+            'words': [],
+            'phrases': []
+        }
+        if self.__processed is not None and len(self.__processed) > 0:
+            output['processed'] = self.__processed
+        if self.__words is not None and len(self.__words) > 0:
+            output['words'] = [w.to_json() for w in self.__words]
+        if self.__phrases is not None and len(self.__phrases) > 0:
+            output['phrases'] = [p.to_json() for p in self.__phrases]
+        return output
+
+    def find_phrase(self, phrase: Phrase = None, phrase_dict: dict = None, phrase_list: list = None):
+        if not self.__phrases:
+            return -1
+        if phrase is not None:
+            phrase_list = [w.get_word() for w in phrase.get_words()]
+        elif phrase_dict is not None:
+            phrase_list = phrase_dict['words']
+        elif phrase_list is None:
+            return -1
+        return find_phrase(self.__phrases, phrase_list)
+
+    def add_phrase(self, phrase: Phrase = None, phrase_dict: dict = None, phrase_list: list = None):
+        if self.__phrases is None:
+            self.__phrases = []
+        if phrase is None:
+            if phrase_dict is not None:
+                phrase = phrase_factory(phrase_dict)
+            elif phrase_list is not None:
+                phrase = phrase_factory({'words': phrase_list})
+            else:
+                return self
+        i = self.find_phrase(phrase)
+        if i > -1:
+            self.__phrases[i].add_freq()
+            return self
+        self.__phrases.append(phrase)
+        return self
+
+    def add_phrases(self, phrase_list: list):
+        if self.__phrases is None:
+            self.__phrases = []
+        # Parallel index of already-seen phrases, keyed by their sorted word texts.
+        phs = [sorted(w.get_word() for w in p.get_words()) for p in self.__phrases]
+        with enlighten.get_manager() as manager:
+            with manager.counter(total=len(phrase_list), desc='Phrases', unit='phrases', color='green') as bar1:
+                for phrase in phrase_list:
+                    p2 = sorted(w.get_word() for w in phrase)
+                    if p2 in phs:
+                        k = phs.index(p2)
+                        self.__phrases[k].add_freq()
+                        bar1.update()
+                        continue
+                    ph = phrase_factory({'words': phrase})
+                    self.__phrases.append(ph)
+                    phs.append(p2)
+                    bar1.update()
+
+    def get_phrases(self):
+        return self.__phrases
+
+    def sort_phrases(self):
+        if self.__phrases is None:
+            return self
+        try:
+            def sort_phrase(p):
+                if p is None:
+                    # Keep the key shape consistent so tuples compare cleanly.
+                    return 0, '', 0
+                if isinstance(p, Phrase):
+                    return p.get_freq(), p.get_type().get_desc(), len(p.get_words())
+                # Dict entries use the to_json() key names.
+                return p['freq'], str(p['type']), len(p['words'])
+            self.__phrases = sorted(self.__phrases, key=sort_phrase)
+        except Exception as e:
+            self._logger.log(repr(self.__phrases), LOG_LEVEL.ERROR)
+            self._logger.log(e)
+        return self
+
+    def sort_words(self):
+        if self.__words is None:
+            return self
+        try:
+            def sort_word(w):
+                if w is None:
+                    return 0, '', ''
+                if isinstance(w, Word):
+                    return w.get_freq(), w.get_type().get_desc(), w.get_word()
+                return w['freq'], str(w['type']), w['word']
+            self.__words = sorted(self.__words, key=sort_word, reverse=True)
+        except Exception as e:
+            self._logger.log(repr(self.__words), LOG_LEVEL.ERROR)
+            self._logger.log(e)
+        return self
+
+    def find_word(self, word: Word = None, word_dict: dict = None, word_str: str = None):
+        if not self.__words:
+            return -1
+        if word is not None:
+            word_str = word.get_word()
+        elif word_dict is not None:
+            word_str = word_dict['word']
+        elif word_str is None:
+            return -1
+        return find_word(self.__words, word_str)
+
+    def add_word(self, word: Word = None, word_dict: dict = None, word_str: str = None):
+        if self.__words is None:
+            self.__words = []
+        if word is None:
+            if word_dict is not None:
+                word = word_factory(word_dict)
+            elif word_str is not None:
+                word = word_factory({'word': word_str})
+            else:
+                return self
+        i = self.find_word(word)
+        if i > -1:
+            self.__words[i].add_freq()
+            return self
+        self.__words.append(word)
+        return self
+
+    def add_words(self, words: list):
+        for w in words:
+            if isinstance(w, Word):
+                self.add_word(word=w)
+            elif isinstance(w, dict):
+                self.add_word(word_dict=w)
+            elif isinstance(w, str):
+                self.add_word(word_str=w)
+        return self
+
+    def get_words(self):
+        return filter_unique_words(self.__words)
+
+    def match_words(self, word_list: list):
+        new_list = []
+        for w in word_list:
+            wi = self.find_word(word_str=w)
+            if wi == -1:
+                # Unknown word: register it instead of silently matching
+                # self.__words[-1].
+                self.add_word(word_str=w)
+                wi = self.find_word(word_str=w)
+            new_list.append(self.__words[wi])
+        return new_list
+
+    def append_to_phrase(self, seed: list = None, length: int = 1):
+        # seed is expected to be a list of indices into self.__words.
+        if seed is None:
+            return [self.__words[0]]
+        max_index = max(seed) + length
+        if max_index >= len(self.__words):
+            if length == 1:
+                return False
+            return self.append_to_phrase(seed, length - 1)
+        return seed + [max_index]
+
+    def get_possible_phrases(self, word_list):
+        print('Adding words.')
+        self.add_words(word_list)
+
+        print('Creating phrases.')
+        with enlighten.get_manager() as manager:
+            with manager.counter(total=len(word_list) ** 2, desc='Phrases', unit='words', color='red') as bar1:
+                phrases = []
+                for length in range(1, len(word_list) + 1):
+                    bar2 = bar1.add_subcounter(color='green')
+                    for start in range(0, len(word_list)):
+                        phrase = build_phrase(word_list, start, start + length)
+                        phrase = self.match_words(phrase)
+                        phrases.append(phrase)
+                        bar2.update()
+                        bar1.update()
+
+        print(f'Created {len(phrases)} phrases.')
+        phrases = sorted(phrases, key=lambda e: len(e))
+
+        print('Adding phrases.')
+        # Really slow (~115000 phrases in one pdf)
+        self.add_phrases(phrases)
+        return self.__phrases
+
+    def is_processed(self, filename: str):
+        return os.path.basename(filename) in self.__processed
+
+    def process(self, filename: str, password: str = None):
+        if self.is_processed(filename):
+            print('Already processed.')
+            return
+        base, _ = os.path.splitext(filename)
+        temp = os.path.realpath(base + '-temp.pdf')
+        print('Removing PDF encryption.')
+        pdf.remove_encryption(filename, password, temp)
+        print('Getting text.')
+        obj = pdf.get_text(temp)
+        os.remove(temp)
+        print('Getting possible phrases.')
+        phrases = self.get_possible_phrases(th.split_words(obj))
+        self.__processed.append(os.path.basename(filename))
+        return phrases
+
+
+def build_phrase(word_list, start: int, end: int = None):
+    if end is None:
+        return word_list[start:]
+    return word_list[start:end]
+
+
+def filter_unique_words(words):
+    new_list = []
+    for w in words:
+        if w not in new_list:
+            new_list.append(w)
+    return new_list
+
+
+def validate_phrase(phrase):
+    # Placeholder: every phrase is currently considered valid.
+    return True
+
+
+def find_phrase(phrases: list, phrase: list):
+    phrase_list = [sorted(w.get_word() for w in p.get_words()) for p in phrases]
+    sphrase = sorted(phrase)
+    if sphrase in phrase_list:
+        return phrase_list.index(sphrase)
+    return -1
+
+
+def find_word(words: list, word: str):
+    word_list = [w.get_word() for w in words]
+    if word in word_list:
+        return word_list.index(word)
+    return -1
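The add_phrases pass above still rescans a list of sorted word lists for every incoming phrase, which is what makes it "really slow". A hash-keyed index makes the dedup O(1) per phrase; a minimal sketch, assuming Phrase/Word as defined in src/ai/models.py below (the dedup_phrases name and index dict are illustrative, not part of the diff):

    from src.ai.models import phrase_factory

    # Sketch: constant-time phrase dedup keyed on the sorted word texts.
    # `phrases` holds Phrase objects; new phrases arrive as lists of Word
    # objects, as in Dictionary.add_phrases.
    def dedup_phrases(phrases, incoming):
        index = {}
        for k, p in enumerate(phrases):
            key = tuple(sorted(w.get_word() for w in p.get_words()))
            index[key] = k
        for words in incoming:
            key = tuple(sorted(w.get_word() for w in words))
            if key in index:
                phrases[index[key]].add_freq()
                continue
            index[key] = len(phrases)
            phrases.append(phrase_factory({'words': words}))
        return phrases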
diff --git a/python/src/ai/models.py b/python/src/ai/models.py
new file mode 100644
index 0000000..184a0ba
--- /dev/null
+++ b/python/src/ai/models.py
@@ -0,0 +1,243 @@
+import json
+
+
+class Type:
+    def __init__(self, _id, _description):
+        self.__id = _id
+        self.__description = _description
+
+    def get_id(self):
+        return self.__id
+
+    def get_desc(self):
+        return self.__description
+
+    def set(self, _id, _description):
+        # Subclasses must go through this setter: assigning
+        # self.__description inside WordType/PhraseType would hit a
+        # name-mangled attribute of the subclass, not this one.
+        self.__id = _id
+        self.__description = _description
+        return self
+
+    def to_json(self):
+        return self.get_id()
+
+    def __repr__(self):
+        return json.dumps({
+            'id': self.get_id(),
+            'description': self.get_desc()
+        })
+
+
+def type_factory(_type: str, _id: int):
+    if _type == 'Word' or _type == 'WordType':
+        t = WordType()
+    elif _type == 'Phrase' or _type == 'PhraseType':
+        t = PhraseType()
+    else:
+        return None
+    t.load(_id)
+    return t
+
+
+class WordType(Type):
+    STRING = 0
+    NUMERIC = 1
+    CURRENCY = 2
+    DATE = 4
+
+    def __init__(self):
+        super().__init__(self.STRING, 'string')
+
+    def load(self, word_type: int):
+        if word_type == self.STRING:
+            self.set(word_type, 'string')
+        elif word_type == self.NUMERIC:
+            self.set(word_type, 'numeric')
+        elif word_type == self.CURRENCY:
+            self.set(word_type, 'currency')
+        elif word_type == self.DATE:
+            self.set(word_type, 'date')
+        return self
+
+
+class PhraseType(Type):
+    TEXT = 0
+    TITLE = 1
+    HEADER = 2
+    MOVEMENT = 4
+    INVALID = 99
+
+    def __init__(self):
+        super().__init__(self.TEXT, 'text')
+
+    def load(self, phrase_type: int):
+        if phrase_type == self.TEXT:
+            self.set(phrase_type, 'text')
+        elif phrase_type == self.TITLE:
+            self.set(phrase_type, 'title')
+        elif phrase_type == self.HEADER:
+            self.set(phrase_type, 'header')
+        elif phrase_type == self.MOVEMENT:
+            self.set(phrase_type, 'movement')
+        elif phrase_type == self.INVALID:
+            self.set(phrase_type, 'invalid')
+        return self
+
+
+class Word:
+    def __init__(self):
+        self.__id = 0
+        self.__word = None
+        self.__type_id = 0
+        self.__type = None
+        self.__frequency = 1
+        self.__fit = None
+
+    def set_id(self, idx: int):
+        self.__id = idx
+        return self
+
+    def set_word(self, word: str):
+        self.__word = word
+        return self
+
+    def set_type(self, word_type):
+        if isinstance(word_type, WordType):
+            self.__type_id = word_type.get_id()
+        if isinstance(word_type, int):
+            self.__type_id = word_type
+        return self
+
+    def set_fit(self, fit: int):
+        # Numeric label assigned by Dictionary.to_data()'s LabelEncoder.
+        self.__fit = fit
+        return self
+
+    def add_freq(self, amount: int = 1):
+        self.__frequency += amount
+        return self
+
+    def get_id(self) -> int:
+        return self.__id
+
+    def get_word(self) -> str:
+        return self.__word
+
+    def get_type_id(self) -> int:
+        return self.__type_id
+
+    def get_type(self) -> WordType:
+        if self.__type is None:
+            self.__type = type_factory('Word', self.__type_id)
+        return self.__type
+
+    def get_fit(self) -> int:
+        return self.__fit
+
+    def get_freq(self) -> int:
+        return self.__frequency
+
+    def to_json(self) -> dict:
+        return {
+            'id': self.get_id(),
+            'word': self.get_word(),
+            'type': self.get_type_id(),
+            'freq': self.get_freq()
+        }
+
+    def __repr__(self):
+        return json.dumps(self.to_json())
+
+
+def word_factory(word: dict) -> Word:
+    w = Word()
+    # Dictionary.add_word builds words from {'word': ...} alone, so the
+    # id (and everything else) must be optional.
+    w.set_id(word.get('id', 0))
+    w.set_word(word['word'])
+    if 'type' in word:
+        w.set_type(word['type'])
+    if 'freq' in word:
+        w.add_freq(word['freq'] - 1)
+    return w
+
+
+class Phrase:
+    def __init__(self):
+        self.__id = 0
+        self.__words = None
+        self.__type_id = 0
+        self.__type = None
+        self.__frequency = 1
+
+    def set_id(self, idx: int):
+        self.__id = idx
+        return self
+
+    def add_word(self, word):
+        if self.__words is None:
+            self.__words = []
+        if isinstance(word, Word):
+            # Keep the Word object itself: Dictionary.find_phrase and
+            # sort_phrases read the text back through get_word().
+            self.__words.append(word)
+        elif isinstance(word, dict):
+            if 'id' in word:
+                self.__words.append(word['id'])
+        elif isinstance(word, int):
+            self.__words.append(word)
+        return self
+
+    def set_words(self, words: list):
+        if self.__words is None:
+            self.__words = []
+        for w in words:
+            self.add_word(w)
+        return self
+
+    def set_type(self, phrase_type):
+        if isinstance(phrase_type, PhraseType):
+            self.__type_id = phrase_type.get_id()
+        if isinstance(phrase_type, int):
+            self.__type_id = phrase_type
+        return self
+
+    def add_freq(self, amount: int = 1):
+        self.__frequency += amount
+        return self
+
+    def get_id(self) -> int:
+        return self.__id
+
+    def get_words(self) -> list:
+        return self.__words
+
+    def get_type_id(self) -> int:
+        return self.__type_id
+
+    def get_type(self) -> PhraseType:
+        if self.__type is None:
+            self.__type = type_factory('Phrase', self.__type_id)
+        return self.__type
+
+    def get_freq(self) -> int:
+        return self.__frequency
+
+    def match(self, word_list: list):
+        # Order-insensitive comparison on word ids.
+        if len(word_list) != len(self.__words):
+            return False
+        ids = sorted(w.get_id() if isinstance(w, Word) else w for w in self.__words)
+        return ids == sorted(word_list)
+
+    def to_json(self):
+        return {
+            'id': self.get_id(),
+            'words': [w.get_id() if isinstance(w, Word) else w for w in self.get_words()],
+            'type': self.get_type_id(),
+            'freq': self.get_freq()
+        }
+
+    def __repr__(self):
+        return json.dumps(self.to_json())
+
+    def __len__(self):
+        return len(self.get_words())
+
+
+def phrase_factory(phrase: dict) -> Phrase:
+    ph = Phrase()
+    ph.set_id(phrase.get('id', 0))
+    ph.set_words(phrase['words'])
+    if 'type' in phrase:
+        ph.set_type(phrase['type'])
+    if 'freq' in phrase:
+        ph.add_freq(phrase['freq'] - 1)
+    return ph
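For reference, the factories round-trip through the JSON shape that Dictionary.save() writes; a small illustrative session (import path as used in dictionary.py, values made up):

    from src.ai.models import word_factory, phrase_factory

    w = word_factory({'word': 'SALDO', 'type': 1, 'freq': 3})  # id defaults to 0
    assert w.to_json() == {'id': 0, 'word': 'SALDO', 'type': 1, 'freq': 3}

    ph = phrase_factory({'words': [w], 'freq': 2})
    assert len(ph) == 1 and ph.get_freq() == 2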
diff --git a/python/src/ai/network.py b/python/src/ai/network.py
new file mode 100644
index 0000000..ae0345a
--- /dev/null
+++ b/python/src/ai/network.py
@@ -0,0 +1,126 @@
+import json
+import os
+
+import numpy as np
+from sklearn.preprocessing import LabelEncoder
+
+import src.contabilidad.pdf as pdf
+import src.contabilidad.text_handler as th
+
+
+class Layer:
+    def __init__(self):
+        # Start empty so add_weight()/set_bias() work on a fresh layer.
+        self.__weights = []
+        self.__bias = []
+
+    def set_size(self, inputs: int, size: int):
+        self.__weights = [[0 for _ in range(0, inputs)] for _ in range(0, size)]
+        self.__bias = [0 for _ in range(0, size)]
+
+    def add_weight(self, vector: list, idx: int = None):
+        if idx is None:
+            self.__weights.append(vector)
+            return self
+        self.__weights = self.__weights[:idx] + [vector] + self.__weights[idx:]
+        return self
+
+    def set_weight(self, value: float, weight_index: int, input_index: int):
+        self.__weights[weight_index][input_index] = value
+
+    def set_bias(self, value: list):
+        self.__bias = value
+
+    def train(self, input_values: list, output_values: list):
+        # TODO: unfinished — the per-neuron relative errors are computed
+        # but not yet propagated back into the weights.
+        output = self.get_output(input_values)
+        errors = []
+        for i, v in enumerate(output):
+            error = (output_values[i] - v) / output_values[i]
+            errors.append(error)
+        return errors
+
+    def to_json(self):
+        return {
+            'bias': self.__bias,
+            'weights': self.__weights
+        }
+
+    def get_output(self, vector: list):
+        # Dense affine map: output[i] = weights[i] . vector + bias[i].
+        output = []
+        for i, weight in enumerate(self.__weights):
+            val = 0
+            for j, v in enumerate(weight):
+                val += v * vector[j]
+            output.append(val + self.__bias[i])
+        return output
+
+
+def layer_factory(layer_dict: dict):
+    layer = Layer()
+    layer.set_bias(layer_dict['bias'])
+    for w in layer_dict['weights']:
+        layer.add_weight(w)
+    return layer
+
+
+class Network:
+    def __init__(self, filename: str):
+        self._filename = filename
+        self.__layers = None
+
+    def load(self):
+        with open(self._filename) as f:
+            data = json.load(f)
+        if 'layers' in data:
+            self.add_layers(data['layers'])
+
+    def add_layers(self, layers: list):
+        if self.__layers is None:
+            self.__layers = []
+        for lr in layers:
+            self.__layers.append(layer_factory(lr))
+
+
+class AI:
+    def __init__(self, dictionary_filename, logger):
+        self.__dict_filename = dictionary_filename
+        self._logger = logger
+        self.__network = None
+        self.__sources = None
+        self._phrases = None
+        self.filename = ''
+
+    def add_source(self, text):
+        if self.__sources is None:
+            self.__sources = []
+        self.__sources.append(text)
+        return self
+
+    def set_filename(self, filename: str):
+        self.filename = filename
+        return self
+
+    def process_sources(self):
+        for source in self.__sources:
+            self.process(**source)
+
+    def process(self, filename, password):
+        encoder = LabelEncoder()
+        base, _ = os.path.splitext(filename)
+        temp = os.path.realpath(base + '-temp.pdf')
+        pdf.remove_encryption(filename, password, temp)
+        obj = pdf.get_text(temp)
+        os.remove(temp)
+        word_list = th.split_words(obj)
+        fits = encoder.fit_transform(word_list)
+        # Every contiguous slice of the document, label-encoded and
+        # zero-padded to a fixed width of len(word_list).
+        phrases = []
+        for length in range(1, len(word_list) + 1):
+            for start in range(0, len(word_list)):
+                phrase = word_list[start:(start + length)]
+                phrase = np.append(np.array([fits[word_list.index(w)] for w in phrase]),
+                                   np.zeros([len(word_list) - len(phrase)]))
+                phrases.append(phrase)
+        self._phrases = np.array(phrases)
+
+    def active_train(self):
+        pass
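Layer.get_output is a plain dense affine map; for reference, the same result via numpy (weights and inputs are illustrative values, not from the diff):

    import numpy as np

    # y = W @ x + b, matching Layer.get_output above.
    W = np.array([[0.2, 0.8], [0.5, -0.1]])
    b = np.array([0.1, 0.0])
    x = np.array([1.0, 2.0])
    print(W @ x + b)  # [1.9 0.3]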
diff --git a/python/src/app.py b/python/src/app.py
index 5722eb2..a372365 100644
--- a/python/src/app.py
+++ b/python/src/app.py
@@ -1,22 +1,40 @@
-import io
 import json
 import os
 import sys
 
-from flask import Flask, request
+import httpx
+from flask import Flask, request, jsonify
 
 import contabilidad.pdf as pdf
 import contabilidad.passwords as passwords
-import contabilidad.log as log
 import contabilidad.text_handler as th
+from contabilidad.log import Log
 
 app = Flask(__name__)
-log.logging['filename'] = '/var/log/python/contabilidad.log'
+log = Log('/var/log/python/contabilidad.log')
+api_key = os.environ.get('PYTHON_KEY')
+
+
+def validate_key(request_obj):
+    if 'Authorization' in request_obj.headers:
+        auth = request_obj.headers.get('Authorization')
+        if isinstance(auth, list):
+            auth = auth[0]
+        if 'Bearer' in auth:
+            auth = auth.split(' ')[1]
+        return auth == api_key
+    if 'API_KEY' in request_obj.values:
+        return request_obj.values.get('API_KEY') == api_key
+    if 'api_key' in request_obj.values:
+        return request_obj.values.get('api_key') == api_key
+    return False
 
 
 @app.route('/pdf/parse', methods=['POST'])
 def pdf_parse():
+    if not validate_key(request):
+        return jsonify({'message': 'Not Authorized'}), 401
     data = request.get_json()
     if not isinstance(data['files'], list):
         data['files'] = [data['files']]
@@ -32,6 +50,11 @@ def pdf_parse():
             continue
         pdf.remove_encryption(filename, p, temp)
         obj = pdf.get_data(temp)
+        try:
+            text = th.text_cleanup(pdf.get_text(temp))
+        except IndexError as ie:
+            print(ie, file=sys.stderr)
+            os.remove(temp)
+            continue
         outputs = []
         for o in obj:
             out = json.loads(o.df.to_json(orient='records'))
@@ -48,8 +71,35 @@ def pdf_parse():
                 out[i] = line
             outputs.append(out)
         os.remove(temp)
-        output.append({'filename': file['filename'], 'text': outputs})
-    return json.dumps(output)
+        output.append({'bank': text['bank'], 'filename': file['filename'], 'tables': outputs, 'text': text['text']})
+    return jsonify(output)
+
+
+@app.route('/cambio/get', methods=['POST'])
+def cambios():
+    if not validate_key(request):
+        return jsonify({'message': 'Not Authorized'}), 401
+    data = request.get_json()
+    valid = {
+        "CLF": "uf",
+        "IVP": "ivp",
+        "USD": "dolar",
+        "USDo": "dolar_intercambio",
+        "EUR": "euro",
+        "IPC": "ipc",
+        "UTM": "utm",
+        "IMACEC": "imacec",
+        "TPM": "tpm",
+        "CUP": "libra_cobre",
+        "TZD": "tasa_desempleo",
+        "BTC": "bitcoin"
+    }
+    if data.get('desde') not in valid:
+        return jsonify({'error': 'Indicador no soportado.'}), 400
+    base_url = 'https://mindicador.cl/api/'
+    url = f"{base_url}{valid[data['desde']]}/{'-'.join(list(reversed(data['fecha'].split('-'))))}"
+    res = httpx.get(url)
+    if res.status_code != httpx.codes.OK:
+        return jsonify({'error': 'Valor no encontrado.'}), 404
+    return jsonify(res.json())
 
 
 if __name__ == '__main__':
diff --git a/python/src/contabilidad/__pycache__/log.cpython-39.pyc b/python/src/contabilidad/__pycache__/log.cpython-39.pyc
deleted file mode 100644
index 36d64f1..0000000
Binary files a/python/src/contabilidad/__pycache__/log.cpython-39.pyc and /dev/null differ
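For reference, the new /cambio/get endpoint can be exercised with the same httpx client the image now installs (host and key are illustrative; the indicator names come from the valid table above, served by mindicador.cl):

    import httpx

    res = httpx.post('http://localhost:5000/cambio/get',
                     headers={'Authorization': 'Bearer <PYTHON_KEY>'},
                     json={'desde': 'USD', 'fecha': '2021-06-15'})
    print(res.json())  # mindicador.cl payload for 'dolar' on 15-06-2021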
diff --git a/python/src/contabilidad/log.py b/python/src/contabilidad/log.py
index c16024d..a1d908b 100644
--- a/python/src/contabilidad/log.py
+++ b/python/src/contabilidad/log.py
@@ -1,19 +1,65 @@
+import os.path
 import time
-
-
-logging = {
-    'filename': '/var/log/python/error.log'
-}
+import traceback
 
 
 class LOG_LEVEL:
-    INFO = 'INFO'
-    WARNING = 'WARNING'
-    DEBUG = 'DEBUG'
-    ERROR = 'ERROR'
+    INFO = 0
+    WARNING = 1
+    DEBUG = 2
+    ERROR = 4
+
+    @staticmethod
+    def desc(level):
+        mapping = {
+            LOG_LEVEL.INFO: 'INFO',
+            LOG_LEVEL.WARNING: 'WARNING',
+            LOG_LEVEL.DEBUG: 'DEBUG',
+            LOG_LEVEL.ERROR: 'ERROR'
+        }
+        return mapping[level]
+
+
+class Logger:
+    def __init__(self):
+        self._logs = []
+
+    def add_log(self, filename: str, min_level: int = LOG_LEVEL.INFO):
+        self._logs.append({'log': Log(filename), 'level': min_level})
+        self._logs.sort(key=lambda e: e['level'])
+        return self
+
+    def log(self, message, level: int = LOG_LEVEL.INFO):
+        for log in self._logs:
+            # Write to every log whose minimum level the message meets.
+            if level >= log['level']:
+                log['log'].log(message, level)
 
-def log(message, level=LOG_LEVEL.INFO):
-    filename = logging['filename']
-    with open(filename, 'a') as f:
-        f.write(time.strftime('[%Y-%m-%d %H:%M:%S] ') + ' - ' + level + ': ' + message)
+
+class Log:
+    MAX_SIZE = 10 * 1024 * 1024  # 10 MiB per file before rolling over
+
+    def __init__(self, filename: str = '/var/log/python/error.log'):
+        self._filename = filename
+
+    def log(self, message, level: int = LOG_LEVEL.INFO):
+        if isinstance(message, Exception):
+            message = traceback.format_exc()
+            if level < LOG_LEVEL.ERROR:
+                level = LOG_LEVEL.ERROR
+        self.rotate_file()
+        with open(self._filename, 'a') as f:
+            f.write(time.strftime('[%Y-%m-%d %H:%M:%S]') + ' - ' + LOG_LEVEL.desc(level=level) + ': ' + str(message) + "\n")
+
+    def rotate_file(self):
+        if not os.path.isfile(self._filename):
+            return
+        if os.path.getsize(self._filename) > self.MAX_SIZE:
+            self.next_file()
+
+    def next_file(self):
+        # Writes roll over to error.1.log, error.2.log, ... once MAX_SIZE is hit.
+        name = self._filename.split('.')
+        n = 1
+        if name[-2].isnumeric():
+            n = int(name[-2]) + 1
+            name = name[:-2] + [name[-1]]
+        self._filename = '.'.join(name[:-1] + [str(n), name[-1]])
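The new Logger fans each message out to every Log whose minimum level the message meets; a short illustrative usage (filenames made up):

    from contabilidad.log import Logger, LOG_LEVEL

    logger = Logger()
    logger.add_log('/var/log/python/contabilidad.log')            # INFO and up
    logger.add_log('/var/log/python/error.log', LOG_LEVEL.ERROR)  # ERROR only
    logger.log('parsed 3 files')         # written to contabilidad.log only
    logger.log('boom', LOG_LEVEL.ERROR)  # written to both files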
diff --git a/python/src/contabilidad/text_handler.py b/python/src/contabilidad/text_handler.py
index 27690ad..6d5240c 100644
--- a/python/src/contabilidad/text_handler.py
+++ b/python/src/contabilidad/text_handler.py
@@ -1,48 +1,112 @@
-def text_cleanup(text, filename: str = None):
+def text_cleanup(text: str):
     if isinstance(text, list):
-        output = []
-        for t in text:
-            output.append(text_cleanup(t, filename=filename))
-        return output
-    if filename is None:
-        return text
-    if 'bice' in filename.lower():
-        return bice(text)
-    if 'scotiabank' in filename.lower():
-        return scotiabank(text)
-    return text
+        text = "\n\n\n".join(text)
+    if 'bice' in text.lower():
+        return {'bank': 'BICE', 'text': bice(text)}
+    if 'scotiabank' in text.lower():
+        return {'bank': 'Scotiabank', 'text': scotiabank(text)}
+    if 'TARJETA' in text:
+        return {'bank': 'Scotiabank', 'text': tarjeta(text)}
+    return {'bank': 'unknown', 'text': basic(text)}
 
 
 def bice(text):
-    lines = text.split("\n\n\n")
-    print(lines)
-    return text
+    lines = [t2.strip() for t in text.split("\n\n\n")
+             for t1 in t.split("\n\n") for t2 in t1.split("\n") if t2.strip() != '']
+    output = []
+    output += extract_from_to(lines, 'NOMBRE DEL CLIENTE', end='LAS CONDES', line_length=3)
+    ti = [t for t in lines if 'MOVIMIENTOS DE LA CUENTA CORRIENTE' in t][0]
+    output += extract_from_to(lines, 'LAS CONDES', end=ti, line_length=3)
+    output += [ti]
+    ti = [i for i, t in enumerate(lines) if 'FECHA' in t]
+    output += extract_from_to(lines, ti[0], end=ti[1], line_length=4)
+    output += extract_from_to(lines, 'RESUMEN DEL PERIODO', end='SALDO INICIAL', line_length=1)
+    output += extract_from_to(lines, 'SALDO INICIAL', end='LINEA SOBREGIRO AUTORIZADA', line_length=4)
+    output += extract_from_to(lines, 'LINEA SOBREGIRO AUTORIZADA', end='OBSERVACIONES', line_length=3)
+    output += extract_from_to(lines, 'OBSERVACIONES', line_length=1)
+    return output
 
 
 def scotiabank(text):
-    words = text.split("\n")
+    words = split_words(text)
     output = [words[0]]
-    output = output + extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
-    output = output + extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
-                                      end='NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', line_length=2)
-    output = output + extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
-                                      line_length=1)
-    output = output + extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
-    output = output + extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
-                                   merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
-    [print(li) for li in output]
-    return text
+    output += extract_from_to(words, 'No. CTA.', end='VENCIMIENTO LINEA DE CREDITO', line_length=3)
+    output += extract_from_to(words, 'VENCIMIENTO LINEA DE CREDITO',
+                              end='NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', line_length=2)
+    output += extract_from_to(words, 'NOMBRE EJECUTIVO: LILIAN AVILA MANRIQUEZ', end='SALDO ANTERIOR',
+                              line_length=1)
+    output += extract_from_to(words, 'SALDO ANTERIOR', end='FECHA', line_length=4)
+    output += extract_data(words, 'FECHA', end='ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', line_length=6,
+                           merge_list=[['DOCTO', 'No.'], ['SALDO', 'DIARIO']])
+    output += extract_from_to(words, 'ACTUALICE SIEMPRE ANTECEDENTES LEGALES, ', 1)
+    return output
 
 
+def tarjeta(text):
+    words = split_words(text)
+    output = ['ESTADO DE CUENTA NACIONAL DE TARJETA DE CRÉDITO']
+    i = [i for i, w in enumerate(words) if 'FECHA ESTADO DE CUENTA' in w][0] + 2
+    output += extract_from_to(words, 'NOMBRE DEL TITULAR', end=i, line_length=2)
+    output += ['I. INFORMACIÓN GENERAL']
+    i = [i for i, w in enumerate(words) if 'CUPO TOTAL' in w][1]
+    output += extract_from_to(words, 'CUPO TOTAL', end=i, line_length=3)
+    output += extract_from_to(words, i, end='ROTATIVO', line_length=4)
+    output += extract_from_to(words, 'ROTATIVO', end='TASA INTERÉS VIGENTE', line_length=3)
+    output += extract_from_to(words, 'TASA INTERÉS VIGENTE',
+                              end='CAE se calcula sobre un supuesto de gasto mensual de UF 20 y pagadero en 12 cuotas.',
+                              line_length=4)
+    output += extract_from_to(words, 'DESDE', end='PERÍODO FACTURADO', line_length=2)
+    output += extract_from_to(words, 'PERÍODO FACTURADO', end='II.', line_length=3)
+    output += ['II. DETALLE']
+    output += extract_from_to(words, '1. PERÍODO ANTERIOR', end='SALDO ADEUDADO INICIO PERÍODO ANTERIOR', line_length=3)
+    i = words.index('2. PERÍODO ACTUAL')
+    output += extract_from_to(words, 'SALDO ADEUDADO INICIO PERÍODO ANTERIOR', end=i - 1, line_length=2,
+                              merge_list=[['MONTO FACTURADO A PAGAR (PERÍODO ANTERIOR)', '(A)']], merge_character=" ")
+    output += ['2. PERÍODO ACTUAL']
+    output += extract_from_to(words, 'LUGAR DE', end='1.TOTAL OPERACIONES', line_length=7,
+                              merge_list=[['OPERACIÓN', 'O COBRO'], ['TOTAL A', 'PAGAR'], ['VALOR CUOTA', 'MENSUAL']])
+    i = words.index('1.TOTAL OPERACIONES') + 3
+    output += extract_from_to(words, '1.TOTAL OPERACIONES', end=i, line_length=3)
+    output += extract_from_to(words, i, end='TOTAL PAGOS A LA CUENTA', line_length=7)
+    i = words.index('TOTAL PAGOS A LA CUENTA') + 2
+    output += extract_from_to(words, 'TOTAL PAGOS A LA CUENTA', end=i, line_length=2)
+    output += extract_from_to(words, i, end='TOTAL PAT A LA CUENTA', line_length=8)
+    i = words.index('TOTAL PAT A LA CUENTA') + 2
+    output += extract_from_to(words, 'TOTAL PAT A LA CUENTA', end=i, line_length=2)
+    output += extract_from_to(words, i, end=i + 3, line_length=2,
+                              merge_list=[
+                                  ['2.PRODUCTOS O SERVICIOS VOLUNTARIAMENTE CONTRATADOS SIN MOVIMIENTOS', '(C)']],
+                              merge_character=" ")
+    if '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS' in words:
+        i = words.index('3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS') + 3
+        output += extract_from_to(words, '3.CARGOS, COMISIONES, IMPUESTOS Y ABONOS', end=i, line_length=3)
+    return output
+
+
+def basic(text):
+    return split_words(text)
+
+
+def split_words(text):
+    if isinstance(text, list):
+        text = "\n\n\n".join(text)
+    words = [t.strip() for t in text.split("\n") if t.strip() != '']
+    return words
+
+
+def extract_from_to(word_list, start, line_length, end=None, merge_list=None, merge_character="\n"):
+    if not isinstance(start, int):
+        start = word_list.index(start)
     if end is not None:
-        return extract_by_line(word_list[word_list.index(start):word_list.index(end)], line_length, merge_list)
-    return extract_by_line(word_list[word_list.index(start):], line_length, merge_list)
+        if not isinstance(end, int):
+            end = word_list.index(end)
+        return extract_by_line(word_list[start:end], line_length, merge_list, merge_character)
+    return extract_by_line(word_list[start:], line_length, merge_list, merge_character)
 
 
-def extract_by_line(word_list, line_length, merge_list=None):
+def extract_by_line(word_list, line_length, merge_list=None, merge_character="\n"):
     if merge_list is not None:
-        word_list = merge_words(word_list, merge_list)
+        word_list = merge_words(word_list, merge_list, merge_character)
     output = []
     line = []
     for k, w in enumerate(word_list):
@@ -54,22 +118,39 @@ def extract_by_line(word_list, line_length, merge_list=None):
     return output
 
 
-def merge_words(word_list, merge_list):
+def merge_words(word_list, merge_list, merge_character):
     for m in merge_list:
-        i = word_list.index(m[0])
-        word_list = word_list[:i] + ["\n".join(m)] + word_list[i+len(m):]
+        ixs = find_words(word_list, m)
+        if not ixs:
+            continue
+        # Merge right-to-left so earlier indices stay valid as the list shrinks.
+        for i in reversed(ixs):
+            word_list = word_list[:i] + [merge_character.join(m)] + word_list[i + len(m):]
     return word_list
 
 
+def find_words(word_list, find_list):
+    ixs = [i for i, w in enumerate(word_list) if find_list[0] == w]
+    output = []
+    for i in ixs:
+        if i + len(find_list) > len(word_list):
+            # Not enough words left to match the whole sequence.
+            continue
+        mistake = False
+        for k, m in enumerate(find_list):
+            if m != word_list[i + k]:
+                mistake = True
+                break
+        if not mistake:
+            output.append(i)
+    return output
+
+
-def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
+def extract_data(word_list, start, line_length, end=None, merge_list=None, merge_character="\n", date_sep='/'):
     word_list = word_list[word_list.index(start):]
     if end is not None:
         word_list = word_list[:word_list.index(end)]
     if merge_list is not None:
-        word_list = merge_words(word_list, merge_list)
+        word_list = merge_words(word_list, merge_list, merge_character)
     output = []
     line = []
-    line_num = 0
     col = 0
     for k, w in enumerate(word_list):
         if col > 0 and col % line_length == 0:
@@ -87,4 +168,5 @@ def extract_data(word_list, start, line_length, end=None, merge_list=None, date_sep='/'):
             continue
         line.append(w)
         col += 1
+    if line:
+        output.append(line)
     return output
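merge_words now collapses every occurrence of a multi-token header into a single cell; an illustrative run using one of the Scotiabank merge lists (the sample word list is made up):

    words = ['FECHA', 'DOCTO', 'No.', 'MONTO', 'SALDO', 'DIARIO']
    print(merge_words(words, [['DOCTO', 'No.'], ['SALDO', 'DIARIO']], ' '))
    # ['FECHA', 'DOCTO No.', 'MONTO', 'SALDO DIARIO']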
diff --git a/python/src/main.py b/python/src/main.py
index 229b132..bcbd999 100644
--- a/python/src/main.py
+++ b/python/src/main.py
@@ -3,22 +3,51 @@ import os
 
 import contabilidad.pdf as pdf
 import contabilidad.text_handler as th
+from contabilidad.log import Logger, LOG_LEVEL
+import ai.dictionary as dictionary
+from ai.network import AI
+
+
+def parse_settings(args):
+    output = {'filename': args.filename}
+    if not os.path.isfile(output['filename']):
+        output['filename'] = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
+    base = os.path.splitext(os.path.basename(args.filename))[0]
+    output['temp'] = os.path.realpath(os.path.join(os.path.dirname(output['filename']), base + '-temp.pdf'))
+    output['dictionary'] = os.path.join(os.path.dirname(output['filename']), 'dictionary.json')
+    output['network'] = os.path.join(os.path.dirname(output['filename']), 'network.json')
+    output['log_file'] = args.log_file
+    if not os.path.isfile(output['log_file']):
+        output['log_file'] = os.path.join(os.path.dirname(os.path.dirname(output['filename'])), output['log_file'])
+    output['error_log_file'] = os.path.join(os.path.dirname(output['log_file']), 'error.log')
+    output['logger'] = Logger()
+    output['logger'].add_log(output['log_file']).add_log(output['error_log_file'], LOG_LEVEL.ERROR)
+    return output
 
 
 def main(args):
-    filename = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.filename))
-    temp = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', 'data', args.temp_filename))
-    pdf.remove_encryption(filename, args.password, temp)
-    obj = pdf.get_data(temp)
-    obj = pdf.get_text(filename, args.password)
-    text = th.text_cleanup(obj, filename=str(args.filename))
-    os.remove(temp)
+    settings = parse_settings(args)
+
+    print('Loading AI')
+    network = AI(settings['dictionary'], settings['logger'])
+    network.set_filename(settings['network'])
+    network.add_source({'filename': settings['filename'], 'password': args.password})
+    network.process_sources()
+    # NOTE: execution stops here; the dictionary pass below is currently unreachable.
+    exit()
+
+    print('Loading dictionary.')
+    dictio = dictionary.Dictionary(settings['dictionary'], settings['logger'])
+    print('Getting possible phrases.')
+    dictio.process(settings['filename'], args.password)
+    dictio.to_data()
+    # print('Saving dictionary.')
+    # dictio.save()
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-f', '--filename', type=str)
     parser.add_argument('-p', '--password', type=str, default='')
-    parser.add_argument('-t', '--temp_filename', type=str)
+    # A concrete default keeps parse_settings from calling os.path.isfile(None).
+    parser.add_argument('-l', '--log_file', type=str, default='contabilidad.log')
     _args = parser.parse_args()
     main(_args)
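For reference, the reworked entry point can be exercised like so (flag names per the argparse definition above; the sample PDF is the one added in this diff, the password placeholder is yours to fill, and the imports expect the working directory to be python/src):

    cd python/src
    python main.py -f EECCvirtual-Visa.pdf -p <pdf-password> -l contabilidad.log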