#!/usr/bin/perl
# This script parses:
# https://unicode.org/emoji/charts/full-emoji-list.html
# and creates a mys kmap file from it.
# Gaspar Sinai <gaspar yudit org>
# 2020-06-06

use HTML::TagParser;
use Data::Dumper;{package Data::Dumper;sub qquote{return shift;}}$Data::Dumper::Useperl=1;


# my $html = HTML::TagParser->new('https://unicode.org/emoji/charts/full-emoji-list.html');

# 13.0.0
#my $html = HTML::TagParser->new ('/home/gsinai/Documents/Unicode/ftp.unicode.org/full-emoji-list.html');

# Usage: perl <this script> [chart [unicode-version]]
#
#   chart            what to hand to HTML::TagParser->new; a local file path
#                    (the commented-out line above shows a URL being passed
#                    the same way). Defaults to the author's local 15.0.0 copy.
#   unicode-version  version string written into the generated header line.
#                    Defaults to 15.0.0. It is NOT derived from the chart, so
#                    pass it explicitly when converting a different release.
#
# Running with no arguments behaves exactly as the script always did.
my $default_chart   = '/home/gsinai/Documents/Unicode/ftp.unicode.org/full-emoji-list-15.0.0.html';
my $default_version = '15.0.0';

my $chart   = (@ARGV > 0) ? $ARGV[0] : $default_chart;
my $version = (@ARGV > 1) ? $ARGV[1] : $default_version;

my $html = HTML::TagParser->new ($chart);

my @tables = $html->getElementsByTagName("table");

# print, not printf: the text is data, not a format string, so a stray '%'
# in a user-supplied version must not be interpreted as a conversion.
print "# Generated by emoji.pl from Unicode $version CLDR Data\n";

# Every <table> in the document is scanned; parse() itself decides which rows
# are emoji rows and prints them.
foreach my $table (@tables) {
    parse ($table);
}

exit (0);

# parse ($table)
#
# Scans every <tr> below one table element (an object offering subTree(),
# getElementsByTagName() and innerText(), as HTML::TagParser elements do) and
# prints one "CODE,NAME" line to the currently selected output handle for
# each emoji row found.
#
# A row is printed only when all of the following hold:
#   - it has at least three <td> cells (rows with fewer are skipped),
#   - the text of the first cell contains a digit (the row number),
#   - the text of the second cell contains at least one "U+<hex digits>".
#
# CODE is the second cell with every "U+" prefix removed; whatever separates
# the code points of a sequence is left untouched. NAME is the text of the
# LAST cell of the row with every "⊛ " marker removed (presumably the chart's
# flag for recently added emoji - confirm against the chart legend).
#
# The caller ignores the return value.
sub parse {
    my ($table) = @_;

    foreach my $tr ($table->subTree()->getElementsByTagName("tr")) {
        my @tds = $tr->subTree()->getElementsByTagName("td");
        next unless (@tds >= 3);

        # innerText may yield undef for an empty cell; treat that as "no match".
        my $no = $tds[0]->innerText;
        next unless (defined $no && $no =~ /\d+/);

        # Code points are hexadecimal, so the class must be 0-9 and A-F.
        # (It used to be [0-8A-Z], which rejected a leading 9.)
        my $code = $tds[1]->innerText;
        next unless (defined $code && $code =~ /U\+[0-9A-F]+/);
        $code =~ s/U\+//g;

        # Rows can contain empty cells, so a fixed column index (formerly 14)
        # is unreliable; the name is always taken from the last cell instead.
        my $value = $tds[$#tds]->innerText;
        $value = '' unless (defined $value);
        $value =~ s/⊛ //g;

        print "$code,$value\n";
    }
}
