From 854d74b798b0bd4f8ebc2ea15bd02f32dcbcf206 Mon Sep 17 00:00:00 2001 From: Daouzli A Date: Mon, 15 Jan 2018 23:25:31 +0100 Subject: [PATCH] add letters (load data letters froms images) and pnmimage (load pbm/pgm) modules --- __init__.py | 0 letters.py | 165 ++++++++++++++++++++++++++++++++++++++++++++++++++++ pnmimage.py | 136 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 301 insertions(+) create mode 100644 __init__.py create mode 100644 letters.py create mode 100644 pnmimage.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/letters.py b/letters.py new file mode 100644 index 0000000..9ca5149 --- /dev/null +++ b/letters.py @@ -0,0 +1,165 @@ +import os +from pnmimage import PnmImage + +class LettersData(object): + def __init__(self, data_folder="data/", filename_expected_data="list_expected_data.txt", car_filename_fmt="ext_ln{}_car{}.pgm"): + self.filename_expected_data = filename_expected_data + self.car_filename_fmt = car_filename_fmt + self.data_folder = data_folder + self.expected_data_list = [] + self.expected_letters = [] + self.letter_to_int = {} + self.letters_classes = {} + self.extract_vocab = [] + self.vocab = {} + self._images = [] + self._expected_values = [] + self.nb_vocab = 0 + self.input_image_size = 0 + self.selection_vocab = {} + self.selection_batch = [] + self.letter_to_vector = {} + self.vector_to_letter = {} + + def _gen_ext_filenames_list(self): + for ln in range(100): + for car in range(100): + n = self.data_folder + self.car_filename_fmt.format(ln, car) + if os.path.exists(n) and n.replace(data_folder, '') in self.filenames_list: + yield(n) + + def _get_expected_letters_list(self): + """Get expected letters list from expected data file. + + :return: list of tuples with the letter and the corresponding filename + + """ + data_list = [] + name = self.filename_expected_data + with open(name) as f: + lst = [(line.split(" ")[0], line.split(" ", 1)[1].replace('\n', '')) for line in f.readlines() if line.strip() != ''] + # list of tuples with (letter, filename) + data_list = [(e[1], e[0]) for e in lst] + return data_list + + def _extract_data(self): + """Extract data. + Create list of expected letters/filenames, base letters with info, classes. + """ + self.expected_data_list = self._get_expected_letters_list() + """list of tuples with letter and filename""" + self.extract_vocab = sorted(set([l[0] for l in self.expected_data_list])) + """list of data vocab letters sorted""" + self.nb_vocab = len(self.extract_vocab) + + self.vocab = {} + letters_list = [l[0] for l in self.expected_data_list] + for i, c in enumerate(self.extract_vocab): + vec = [0] * self.nb_vocab + vec[i] = 1 + self.vocab[c] = { + 'index': i, + 'count': letters_list.count(c), + 'vector': vec + } + self.letter_to_int = {c: i for i, c in enumerate(self.extract_vocab)} + """dict of letters with index of each""" + self.letters_classes = {} + """dict of letters with vector representation of each letter""" + + for letter, idx in self.letter_to_int.items(): + cls = [0] * self.nb_vocab + cls[idx] = 1 + self.letters_classes[letter] = cls + + def get_vocab_with_min_count(self, min_count): + """Get the vocab. A dictionary of letters with count, index and vector. + The index is re-computed and also the vector to match the number of subelements + """ + subvocab = {} + i = 0 + for c, info in self.vocab.items(): + if info['count'] >= min_count: + subvocab[c] = { + 'count': info['count'], + 'index': i, + 'vector': None + } + i += 1 + nb_vocab = len(subvocab) + self.letter_to_vector = {} + self.vector_to_letter = {} + for c in subvocab: + vec = [0] * nb_vocab + vec[subvocab[c]['index']] = 1 + subvocab[c]['vector'] = vec + self.letter_to_vector[c] = vec + return subvocab + + def get_letter_of_vector(self, vector): + """Get the letter corresponding to a given vector. + + :return: the found letter else None + """ + ret = None + for letter, vec in self.letter_to_vector.items(): + if vec == vector: + ret = letter + break + return ret + + def get_batches(self, min_count=0, mini_batch_size=None): + """Get the selection data based on min count of letters in the dataset + + :param min_count: minimal count of same letters to be added (default 0 for the whole dataset) + :param mini_batch_size: size of a mini batch, if None the whole dataset size (default None) + if whole size is not factor of the mini batch size then the last mini batch + has a size < mini batch size + :return: a list of mini batches (at least list of one) + """ + if mini_batch_size is None: + mini_batch_size = len(self.expected_data_list) + self.selection_vocab = self.get_vocab_with_min_count(min_count) + self.selection_batch = [] + X = [] + Y = [] + bsize = 0 + for letter, name in self.expected_data_list: + path = '{}{}'.format(self.data_folder, name) + if letter in self.selection_vocab: + img = PnmImage() + if img.load(path) == False: + print("ERROR: failed to open '{}'".format(path)) + image_size = img.get_size() + bsize += 1 + X.append(img.get_data_bin()) + Y.append(self.selection_vocab[letter]['vector']) + if bsize >= mini_batch_size: + self.selection_batch.append((X, Y)) + X = [] + Y = [] + bsize = 0 + if bsize > 0: + self.selection_batch.append((X, Y)) + return self.selection_batch + + def process(self): + self._extract_data() + + def get_data_with_min_count(self, min_count): + vocab = self.get_vocab_with_min_count(min_count) + #todo + return images, expected_values + + def get_vocab(self): + """Get the vocab. A dictionary of letters with count, index and vector""" + return self.vocab + + def get_classes(self): + return self.letters_classes + + def get_class_element_size(self): + return self.nb_vocab + + def get_input_image_size(self): + return self.input_image_size \ No newline at end of file diff --git a/pnmimage.py b/pnmimage.py new file mode 100644 index 0000000..4483c1c --- /dev/null +++ b/pnmimage.py @@ -0,0 +1,136 @@ +import struct + +class PnmImage(object): + + def __init__(self): + self._filename = None + self._image = [] + self._version = None + self._maxval = None + self._width = 0 + self._height = 0 + + def reset(self): + """Reset all image data (dims, data,...)""" + self._filename = None + self._image = [] + self._version = None + self._maxval = None + self._width = 0 + self._height = 0 + + def _load_bin_pbm(self, fd): + """Load image data from a file of PBM binary format + + :param fd: file descriptor of the file starting to the image data so + the version and dimensions should have been skipped + """ + c = fd.read(1) + i = 0 + size = self._width * self._height + while len(c) > 0: + c = struct.unpack("= size: + break + if i >= size: + break + c = fd.read(1) + + def _load_bin_pgm(self, fd): + """Load image data from a file of PGM binary format + + :param fd: file descriptor of the file starting to the image data so + the version and dimensions should have been skipped + """ + maxval = fd.readline().decode("utf-8").strip() + self._maxval = int(maxval) + c = fd.read(1) + while len(c) > 0: + c = struct.unpack("