from mrjob.job import MRJob
from mrjob.step import MRStep

class MRSessionCount(MRJob):
    """Count, for each category, the number of distinct sessions listing it.

    Input lines have the form ``S: G1, G2, G3, ...`` (a session id, a colon,
    then a comma-separated category list).  The job runs two steps: the
    first collapses repeated (session, category) pairs into one record, the
    second counts the surviving records per category.
    """

    def mapper_duplicates(self, _, line):
        """Emit ``((session, category), 1)`` for every category on *line*.

        The line is split at its first ``:``.  Lines without a ``:`` or with
        an empty session id are skipped (non-blank ones are tallied in the
        ``malformed_lines`` job counter) instead of raising and failing the
        task.  Empty category tokens, e.g. from a trailing comma, are
        dropped.
        """
        # format -> S: G1, G2, G3, ...
        session, sep, cat_str = line.partition(':')
        session = session.strip()
        if not sep or not session:
            if line.strip():
                self.increment_counter('MRSessionCount', 'malformed_lines', 1)
            return

        for token in cat_str.replace(' ', '').split(','):
            # strip() also removes tabs and a stray '\r' from CRLF input,
            # which would otherwise make "G3\r" a separate category.
            category = token.strip()
            if category:
                yield (session, category), 1

    def reducer_duplicates(self, record, _):
        """Emit each distinct ``(session, category)`` key once.

        The grouped values are ignored; yielding one record per key is what
        removes the duplicates.
        """
        yield (record[0], record[1]), None

    def mapper_unique(self, record, _):
        """Re-key a de-duplicated pair by its category with a count of 1."""
        yield record[1], 1

    def reducer_unique(self, category, counts):
        """Sum the counts seen for *category*.

        Also used as the combiner of the second step; this is safe because
        summing partial sums gives the same total.
        """
        yield category, sum(counts)

    def steps(self):
        """Return the two-step pipeline: de-duplicate, then count."""
        return [
            MRStep(mapper=self.mapper_duplicates,
                   reducer=self.reducer_duplicates),
            MRStep(mapper=self.mapper_unique,
                   combiner=self.reducer_unique,
                   reducer=self.reducer_unique)]


if __name__ == "__main__":
    # run() is a classmethod that builds the job from the command line
    # itself, so no throwaway instance is needed here.
    MRSessionCount.run()
