#!/usr/bin/perl
# 
# Copyright (c) 2005 Matthias Bauer <matthiasb@acm.org>
#
# This code is published under the MIT license 
# (http://www.opensource.org/licenses/mit-license.php)
# 
# Convert an HTML file with links to a YAML file for inclusion
# in Rubric (http://search.cpan.org/dist/Rubric/) 
# or de.lirio.us (http://de.lirio.us/code).
# Works well for lynx bookmarks.
# If the input is of DOCTYPE NETSCAPE-Bookmark-file-1
# the additional attributes (id, add_date, last_modified)
# are passed on to Rubric.

use strict;
use warnings "all";

use HTML::TokeParser;
use YAML;

# Read the input (a file named on the command line, or STDIN) in a single
# gulp.  $/ is undefined only inside the do-block, so line-at-a-time
# reading is restored as soon as the read is finished.
my $bookmarks = do { local $/; <> };

# One hash ref per link found in the input, in document order.
my @entries;
# Running counter used as the id of links that carry no 'id' attribute.
my $id=1;

# The parser is given a reference to the slurped document, so it works on
# the in-memory text.  On failure the constructor returns undef and leaves
# the reason in $!.
my $p = HTML::TokeParser->new(\$bookmarks) or die "Cannot create HTML parser: $!";

# Walk over every anchor start tag.  $t->[1] is the hash ref holding the
# tag's attributes.
while (my $t = $p->get_tag ("a" , "A")) {
	my %lh;

	# An anchor without a (true) href cannot become a bookmark entry;
	# this aborts the whole conversion.
	$lh{href} = $t->[1]->{href} or die "'a' tag without href in bookmarks";

	# Netscape and older Mozilla Bookmarks have
	# 'id', 'add_date' and 'last_modified' attributes.
	# Missing (or false) values fall back to: the current time for
	# 'created', the creation time for 'modified', and the running
	# counter for 'id'.
	$lh{created} = $t->[1]->{add_date} || scalar time;
	$lh{modified} = $t->[1]->{last_modified} || $lh{created};
	$lh{id} = $t->[1]->{id} || $id++;

	# The text between <a> and </a> is the title.
	# Without arguments get_text stops at the very next tag, which cuts
	# the title short when it contains inline markup such as <b> or <i>.
	# Collect everything up to the closing </a> instead; also stop at the
	# next opening <a> so that a missing </a> cannot swallow later links.
	my $title = $p->get_text ("/a", "a");
	$lh{description} = $title;

	# Tags are not derived from the HTML; every entry starts with an
	# empty tag list.
	$lh{tags} = [];
	push @entries, \%lh;
}

# Emit all entries as a single YAML document on STDOUT.
print YAML::Dump(\@entries);
