#!/usr/bin/perl

#############################################################################
#                                                                           #
# Hulu video parsing script                                                 #
# http://www.lazorsoftware.com                                              #
#                                                                           #
# Based upon gethulu.pl as included with MythVodka                          #
# http://code.google.com/p/mythvodka/                                       #
#                                                                           #
# Copyright (C) 2009 Lazor Software                                         #
#                                                                           #
#  This program is free software; you can redistribute it and/or modify     #
#  it under the terms of the GNU General Public License as published by     #
#  the Free Software Foundation; either version 2 of the License, or        #
#  (at your option) any later version.                                      #
#                                                                           #
#  This program is distributed in the hope that it will be useful,          #
#  but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#  GNU General Public License for more details.                             #
#                                                                           #
#  The full GPL license can be found on the GNU web site:                   #
#  http://www.gnu.org/licenses/gpl.txt                                      #
#                                                                           #
#############################################################################

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long qw(:config bundling);
use XML::DOM;
use Data::Dumper;
use LWP::Simple;
use Encode;
use HTML::Entities;
use JSON -support_by_pp;

# constants
my $VER     = "1.0.2 - Lazor Software";
my $VERDATE = "04/06/2009";

# force buffer flushing to see output quicker
# ($| only affects the currently selected handle, i.e. STDOUT here)
local $| = 1;

# define the defaults
my $additional  = 0;        # also grab clips/webisodes for each feed
my $cache_info  = 0;        # cache per-stream info files locally
my $outfile     = '&=1';    # sentinel meaning "file descriptor 1" (stdout);
                            # interpreted where the output file is opened
my $skip_movies = 0;        # do not process the movie listing
my $skip_tv     = 0;        # do not process the TV listing
our $verbose    = 0;        # print progress information to stdout
our $DEBUG      = 0;        # debug mode (currently only implies verbose)

# get the command line options
#
# GetOptions() returns false when the command line contains an unknown option
# or a bad/missing option value. In that case show the usage on stderr and
# exit with status 2 instead of silently starting a long run with settings
# the user did not ask for.
GetOptions(
    # script specific parameters
    'additional|a'  => \$additional,
    'cache|c'       => \$cache_info,
    'outfile|o=s'   => \$outfile,
    'skip-movies|m' => \$skip_movies,
    'skip-tv|t'     => \$skip_tv,
    # standard parameters
    'verbose|v'     => \$verbose,
    'debug'         => \$DEBUG,
    'version|V'     => sub { print "$0 v$VER ($VERDATE)\n"; exit; },
    'help|?'        => sub { pod2usage(1) },
    ) or pod2usage(2);

# debug implies verbose
$verbose = 1 if ($DEBUG);



##########################################
###           PROGRAM BEGINS           ###
##########################################


# Open the output handle. MYTHMENU stays a global (bareword) handle because
# grab_feeds() prints to it by name.
#
# The default outfile value '&=1' and the conventional '-' both select stdout
# (file descriptor 1). Any other value is taken literally as a file name: the
# three-argument open() keeps mode characters or surrounding whitespace in a
# user supplied name from being interpreted by open().
if ($outfile eq '&=1' || $outfile eq '-')
{
    open(MYTHMENU, '>&=', 1) or die("Cannot write to outfile: $outfile: $!");
}
else
{
    open(MYTHMENU, '>', $outfile) or die("Cannot write to outfile: $outfile: $!");
}

print MYTHMENU "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
print MYTHMENU "<MediaStreams>\n";

if (!$skip_movies)
{
    grab_feeds('Hulu Movies', 'http://www.hulu.com/browse/alphabetical/feature_film', 'full-mov-icn');
}

if (!$skip_tv)
{
    grab_feeds('Hulu', 'http://www.hulu.com/browse/alphabetical/episodes', 'full-eps-icn');
}

print MYTHMENU '</MediaStreams>',"\n";

# buffered write errors (e.g. disk full) only surface when the handle is closed
close(MYTHMENU) or die("Cannot close outfile: $outfile: $!");


# grab_feeds($provider, $feed_url, $feed_tag)
#
# Download the listing page at $feed_url and, for every line of it matching
# $feed_tag (used as a regex), take the first single-quoted string on that
# line as the URL of a feed page. For each feed one <Feed> element, holding
# one <Stream> element per video found, is printed to the MYTHMENU handle.
#
#   $provider - text written into the <Provider> element of every feed
#   $feed_url - URL of the listing page
#   $feed_tag - pattern identifying the listing lines that link to a feed
#
# Reads the file-level settings $additional and $verbose. Pages that cannot
# be downloaded and stream info that cannot be parsed are skipped (reported
# on stdout in verbose mode) rather than aborting the run.
# Returns nothing.
sub grab_feeds
{

    my $provider  = shift;
    my $feed_url  = shift;
    my $feed_tag  = shift;

    # slider categories handled by --additional => prefix used in stream titles
    my %additional_streams = (
                                'Clips'     => 'Clip',
                                'Webisodes' => 'Webisode',
                             );

    # :KLUGE: request for /videos/expander always returns (up to) 10 per page?
    my $per_page = 10;

    my $feed_list = get($feed_url);

    # get() returns undef when the download fails
    if (!defined $feed_list)
    {
        print "Failed to retrieve feed list: $feed_url\n" if ($verbose);
        return;
    }

    my @feeds = grep(/$feed_tag/, split("\n", $feed_list));

    foreach my $feed (@feeds)
    {

        # the feed URL is the first single-quoted string on the line
        my $url = (split(/'/, $feed))[1];

        if (!defined $url || $url eq '')
        {
            print "No feed URL found in: $feed\n" if ($verbose);
            next;
        }

        # the thumbnail name is built from the 4th "/"-separated piece of the
        # URL, with dashes turned into underscores
        my $image = (split("/", $url))[3];
        $image = '' if (!defined $image);
        $image =~ tr/-/_/;
        $image =  "http://assets.hulu.com/shows/show_thumbnail_" . $image . ".jpg";

        print $url . "\n" if ($verbose);
        my $feed_html = get($url);

        if (!defined $feed_html)
        {
            print "Failed to retrieve feed: $url\n" if ($verbose);
            next;
        }

        my $title = '';

        if ($feed_html =~ m/title>(.*)<\/title/)
        {
            $title =  clean_text($1);
            $title =~ s/Hulu - //g;
        }

        print MYTHMENU "<Feed><Name>$title</Name><Provider>$provider</Provider><FeedImage>$image</FeedImage><Streams>\n";

        my $show_id;
        my @stream_ids_raw = ();
        my %stream_seasons;     # stream ID => season number (0 when unknown)
        my %stream_types;       # stream ID => title prefix for additional streams

        # look for videos grouped by season first
        if ($feed_html =~ m/show\/(\d+)\/episode/)
        {
            $show_id = $1;

            # grab stream IDs from the individual season pages, if present
            while ($feed_html =~ m/season_number=(\d+)&amp/g)
            {
                my $season     = $1;
                my $season_url = "http://www.hulu.com/videos/season_expander?current_video_id=0&order=desc&page=1&season_number=$season&show_id=$show_id&sort=season&video_type=episode";

                foreach my $id (_fetch_watch_ids($season_url))
                {
                    push(@stream_ids_raw, $id);
                    $stream_seasons{$id} = $season;
                }
            }
        }

        # look for additional types of streams; the expander request needs the
        # show ID, so this is skipped when none was found on the feed page
        if ($additional && defined $show_id)
        {

            my @sliders = grep(/new VideoSlider/, split("\n", $feed_html));

            foreach my $slider (@sliders)
            {

                my $category  = ($slider =~ m/category: "(.*?)"/) ? $1 : '';
                my $max_count = ($slider =~ m/maxCount: (\d+)/)   ? $1 : 0;

                next if (!defined $additional_streams{$category});
                next if ($max_count <= 0);

                # calculate the highest page number
                my $last_page = int(($max_count - 1) / $per_page) + 1;

                foreach my $page (1 .. $last_page)
                {
                    my $page_url = "http://www.hulu.com/videos/expander?category=$category&order=desc&page=$page&show_id=$show_id&sort=original_premiere_date&video_type=categorical";

                    foreach my $id (_fetch_watch_ids($page_url))
                    {
                        push(@stream_ids_raw, $id);
                        $stream_seasons{$id} = 0;
                        $stream_types{$id}   = $additional_streams{$category};
                    }
                }
            }

        }

        # no streams found so far? grab streams listed directly on the feed page
        if (!@stream_ids_raw)
        {
            foreach my $id ($feed_html =~ m/watch\/(\d+)/g)
            {
                push(@stream_ids_raw, $id);
                $stream_seasons{$id} = 0;
            }
        }

        # filter out dupe stream IDs, keeping the order of first appearance
        my %streams_found;
        my @stream_ids = grep { !$streams_found{$_}++ } @stream_ids_raw;

        foreach my $stream_id (@stream_ids)
        {

            my $stream_html = get_stream_info($stream_id);

            if (!defined $stream_html || $stream_html eq '')
            {
                print "Failed to retrieve stream info: $stream_id\n" if ($verbose);
                next;
            }

            my $fields;
            my $parsed_ok = eval
            {
                $fields = from_json($stream_html, { allow_barekey => 1 });
                1;
            };

            # anything but a JSON object cannot be used as a field lookup below
            if (!$parsed_ok || ref($fields) ne 'HASH')
            {
                print "Failed to parse stream info: $stream_id\n" if ($verbose);
                next;
            }

            # defaults, keyed by the field names used in the stream info; the
            # season found while collecting the IDs is used unless the stream
            # info carries its own season_number
            my %info = (
                            'description'      => '',
                            'title'            => '',
                            'air_date'         => '',
                            'episode_number'   => '',
                            'season_number'    => $stream_seasons{$stream_id},
                            'programming_type' => '',
                            'duration'         => '',
                            'thumbnail_url'    => '',
                       );

            foreach my $key (keys %info)
            {
                next if (!defined $fields->{$key});
                $info{$key} = clean_text($fields->{$key});
            }

            # prefix title with stream type, if set
            $info{'title'} = $stream_types{$stream_id} . ' - ' . $info{'title'} if (defined $stream_types{$stream_id});

            print MYTHMENU "<Stream>\n",
                           "<Name>$info{'title'}</Name>\n",
                           "<Url>http://www.hulu.com/watch/$stream_id</Url>\n",
                           "<Subtitle>S$info{'season_number'}E$info{'episode_number'}</Subtitle>\n",
                           "<AirDate>$info{'air_date'}</AirDate>\n",
                           "<Synopsis>$info{'description'}</Synopsis>\n",
                           "<RunningTime>$info{'duration'}</RunningTime>\n",
                           "<Type>$info{'programming_type'}</Type>\n",
                           "<StreamImage>$info{'thumbnail_url'}</StreamImage>\n",
                           "</Stream>\n";

        }

        print MYTHMENU "</Streams></Feed>\n";

    }

    return;

}

# _fetch_watch_ids($url)
#
# Download $url (echoing it first in verbose mode) and return the list of
# numeric IDs of every "watch/<id>" occurrence in the page, in page order and
# including duplicates. Returns an empty list when the download fails.
sub _fetch_watch_ids
{

    my $url = shift;

    print $url . "\n" if ($verbose);
    my $html = get($url);

    if (!defined $html)
    {
        print "Failed to retrieve: $url\n" if ($verbose);
        return;
    }

    # in list context a /g match returns every captured ID
    return ($html =~ m/watch\/(\d+)/g);

}

# get_stream_info($stream_id)
#
# Return the body of http://www.hulu.com/videos/info/<stream_id> as a string,
# or '' when it could not be downloaded.
#
# When the file-level $cache_info flag is set, a non-empty /tmp/hulu_<stream_id>
# file is returned instead of downloading, and a successful download is saved
# to that file. In verbose mode the URL (and cache file, if used) is printed
# to stdout. Dies when the cache file cannot be opened or written.
#
# NOTE(review): the cache lives under a predictable name in /tmp; consider a
# private cache directory if this runs on a shared machine.
sub get_stream_info
{

    my $stream_id = shift;
    my $info_url  = "http://www.hulu.com/videos/info/" . $stream_id;
    my $tmp_file  = "/tmp/hulu_$stream_id";

    print $info_url if ($verbose);

    # -s is only true for an existing file of non-zero size
    if ($cache_info && -s $tmp_file)
    {
        print " (cached in $tmp_file) ...\n" if ($verbose);

        open(my $fh, '<', $tmp_file) or die("Cannot open info file $tmp_file: $!\n");

        # slurp the file unchanged (joining the lines with "\n" would double
        # every line break that is already part of the cached content)
        my $cached = do { local $/; <$fh> };
        close($fh);

        return defined $cached ? $cached : '';
    }

    print "\n" if ($verbose);

    my $stream_html = get($info_url);

    # get() returns undef on failure; report that as '' and never cache it
    return '' if (!defined $stream_html || $stream_html eq '');

    if ($cache_info)
    {
        open(my $fh, '>', $tmp_file) or die("Cannot open info file $tmp_file: $!\n");
        print {$fh} $stream_html;
        close($fh) or die("Cannot write info file $tmp_file: $!\n");
    }

    return $stream_html;

}

# clean_text($text)
#
# Prepare a piece of scraped text for inclusion in the XML output:
#   1. decode HTML entities,
#   2. replace some "extended" characters (like Microsoft Smart Quotes), and
#      their literal "\uXXXX" spellings, with plain ASCII equivalents,
#   3. re-encode the characters < > & " as entities,
#   4. encode the result as UTF-8.
# Returns the cleaned string ('' for an undefined argument).
sub clean_text
{
    my $text = shift;
    return '' if (!defined $text);

    $text = decode_entities($text);

    # non-breaking space => plain space
    $text =~ s/\x{00a0}/ /g;

    # soft hyphen, en dash, em dash => hyphen
    $text =~ s/(?:\x{00ad}|\x{2013}|\x{2014})/-/g;

    # single quotes (the 0x209x code points are presumably mis-mapped quote
    # characters seen in the source data -- kept as in the original list)
    $text =~ s/(?:\x{2018}|\x{2019}|\x{2099}|\\u2018|\\u2019|\\u2099)/'/g;

    # double quotes; every \x{...} entry has a matching literal \u.... entry
    $text =~ s/(?:\x{201c}|\x{201d}|\x{209c}|\x{209d}|\\u201c|\\u201d|\\u209c|\\u209d)/"/g;

    # ellipsis, including its raw UTF-8 byte sequence
    $text =~ s/(?:\x{2026}|\x{20a6}|\\u2026|\\u20a6|\xe2\x80\xa6)/.../g;

    $text = encode('utf8', encode_entities($text, '<>&"'));

    return $text;
}



__END__

=head1 NAME

gethulu.pl - Retrieve movie and TV information from Hulu

=head1 SYNOPSIS

 gethulu.pl [options] [-o outfile]

 Options:
    -a, --additional    Grab additional available streams (clips, webisodes, etc)
                        for each feed.
    -c, --cache         Cache downloaded video info locally
    -o, --outfile       File used for XML output (Default: stdout)
    -m, --skip-movies   Do not download information for movies
    -t, --skip-tv       Do not download information for TV shows

        --debug         Debug mode. Verbose printing of what is going on.
    -v, --verbose       Verbose mode. Print out a little bit of helpful info
                        NOTE: Using this option without the -o option will
                        prevent the output from being valid XML.
    -V, --version       Return the version of the script.
    -?, --help          This screen

=head1 OPTIONS

=over 8

=item B<-a, --additional>

Grab additional available streams (clips, webisodes, etc)
for each feed.

=item B<-c, --cache>

Cache downloaded video info locally

=item B<-o, --outfile>

File used for XML output (Default: stdout)

=item B<-m, --skip-movies>

Do not download information for movies

=item B<-t, --skip-tv>

Do not download information for TV shows

=item B<--debug>

Debug mode. Verbose printing of what is going on.

=item B<-v, --verbose>

Verbose mode. Print out a little bit of helpful info
NOTE: Using this option without the -o option will
prevent the output from being valid XML.

=item B<-V, --version>

Return the version of the script.

=back

=head1 DESCRIPTION

This script will extract movie and TV episode information from Hulu and dump
the results to XML. This XML can then be parsed by other third party programs.

=cut
