__author__ = "Adel Daouzli" __licence__ = "GPLv3" import os from pnmimage import PnmImage class LettersData(object): def __init__(self, data_folder="data/", filename_expected_data="list_expected_data.txt", car_filename_fmt="ext_ln{}_car{}.pgm"): self.filename_expected_data = filename_expected_data self.car_filename_fmt = car_filename_fmt self.data_folder = data_folder self.expected_data_list = [] self.expected_letters = [] self.letter_to_int = {} self.letters_classes = {} self.extract_vocab = [] self.vocab = {} self._images = [] self._expected_values = [] self.nb_vocab = 0 self.input_image_size = 0 self.selection_vocab = {} self.selection_batch = [] self.letter_to_vector = {} self.vector_to_letter = {} def _gen_ext_filenames_list(self): for ln in range(100): for car in range(100): n = self.data_folder + self.car_filename_fmt.format(ln, car) if os.path.exists(n) and n.replace(data_folder, '') in self.filenames_list: yield(n) def _get_expected_letters_list(self): """Get expected letters list from expected data file. :return: list of tuples with the letter and the corresponding filename """ data_list = [] name = self.filename_expected_data with open(name) as f: lst = [(line.split(" ")[0], line.split(" ", 1)[1].replace('\n', '')) for line in f.readlines() if line.strip() != ''] # list of tuples with (letter, filename) data_list = [(e[1], e[0]) for e in lst] return data_list def _extract_data(self): """Extract data. Create list of expected letters/filenames, base letters with info, classes. """ self.expected_data_list = self._get_expected_letters_list() """list of tuples with letter and filename""" self.extract_vocab = sorted(set([l[0] for l in self.expected_data_list])) """list of data vocab letters sorted""" self.nb_vocab = len(self.extract_vocab) self.vocab = {} letters_list = [l[0] for l in self.expected_data_list] for i, c in enumerate(self.extract_vocab): vec = [0] * self.nb_vocab vec[i] = 1 self.vocab[c] = { 'index': i, 'count': letters_list.count(c), 'vector': vec } self.letter_to_int = {c: i for i, c in enumerate(self.extract_vocab)} """dict of letters with index of each""" self.letters_classes = {} """dict of letters with vector representation of each letter""" for letter, idx in sorted(self.letter_to_int.items(), key=lambda x: x[0]): cls = [0] * self.nb_vocab cls[idx] = 1 self.letters_classes[letter] = cls def get_vocab_with_min_count(self, min_count): """Get the vocab. A dictionary of letters with count, index and vector. The index is re-computed and also the vector to match the number of subelements """ subvocab = {} i = 0 for c, info in sorted(self.vocab.items(), key=lambda x: x[0]): if info['count'] >= min_count: subvocab[c] = { 'count': info['count'], 'index': i, 'vector': None } i += 1 nb_vocab = len(subvocab) self.letter_to_vector = {} self.vector_to_letter = {} for c in sorted(subvocab): vec = [0] * nb_vocab vec[subvocab[c]['index']] = 1 subvocab[c]['vector'] = vec self.letter_to_vector[c] = vec return subvocab def get_letter_of_vector(self, vector): """Get the letter corresponding to a given vector. 

        :param vector: a flat list giving the one-hot vector representation
                       of a letter
        :return: the matching letter, else None
        """
        assert isinstance(vector, list)
        ret = None
        for letter, vec in self.letter_to_vector.items():
            if vec == vector:
                ret = letter
                break
        return ret

    def get_batches(self, min_count=0, mini_batch_size=None):
        """Get the selection data based on the minimal count of letters in the
        dataset.

        :param min_count: minimal count of identical letters for a letter to be
                          included (default 0, i.e. the whole dataset)
        :param mini_batch_size: size of a mini batch; if None, the whole dataset
                                size is used (default None). If the dataset size
                                is not a multiple of the mini batch size, the
                                last mini batch is smaller.
        :return: a list of mini batches (at least one)
        """
        if mini_batch_size is None:
            mini_batch_size = len(self.expected_data_list)
        self.selection_vocab = self.get_vocab_with_min_count(min_count)
        self.selection_batch = []
        X = []
        Y = []
        bsize = 0
        for letter, name in self.expected_data_list:
            path = '{}{}'.format(self.data_folder, name)
            if letter in self.selection_vocab:
                img = PnmImage()
                if not img.load(path):
                    print("ERROR: failed to open '{}'".format(path))
                    continue
                self.input_image_size = img.get_size()
                bsize += 1
                X.append(img.get_data_bin())
                Y.append(self.selection_vocab[letter]['vector'])
                if bsize >= mini_batch_size:
                    self.selection_batch.append((X, Y))
                    X = []
                    Y = []
                    bsize = 0
        if bsize > 0:
            self.selection_batch.append((X, Y))
        return self.selection_batch

    def process(self):
        """Extract the data from the expected data file."""
        self._extract_data()

    def get_data_with_min_count(self, min_count):
        """Get the images and expected values for letters with at least
        `min_count` occurrences."""
        vocab = self.get_vocab_with_min_count(min_count)
        # TODO: build the images and expected values from the selected vocab
        return self._images, self._expected_values

    def get_vocab(self):
        """Get the vocab. A dictionary of letters with count, index and vector."""
        return self.vocab

    def get_classes(self):
        """Get the dict mapping each letter to its one-hot class vector."""
        return self.letters_classes

    def get_class_element_size(self):
        """Get the number of letters in the vocabulary (class vector size)."""
        return self.nb_vocab

    def get_input_image_size(self):
        """Get the size of the last loaded input image."""
        return self.input_image_size
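

# Minimal usage sketch, assuming the default layout: a "list_expected_data.txt"
# file mapping "<image filename> <letter>" per line, and the extracted PGM
# character images in "data/". The min_count and mini_batch_size values below
# are illustrative only, not values prescribed by the module.
if __name__ == "__main__":
    letters = LettersData()
    letters.process()
    print("vocabulary size:", letters.get_class_element_size())
    # keep only letters seen at least 3 times, grouped into mini-batches of 32
    batches = letters.get_batches(min_count=3, mini_batch_size=32)
    print("number of mini-batches:", len(batches))
    if batches:
        X, Y = batches[0]
        print("first batch: {} samples, class vector size {}".format(
            len(X), len(Y[0]) if Y else 0))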