draco.pl

#!/usr/bin/perl

use strict;
use warnings;
use Time::Piece;

use HTTP::Tiny;
use JSON::MaybeXS;

# For wrapping comment blocks.
use Unicode::LineBreak;
my $lb = Unicode::LineBreak->new(ColMax => 76); # Default is 76.

# Printing UTF-8 to STDOUT.
binmode(STDOUT, "encoding(UTF-8)");

die "usage: draco [-dhv] <url>\n" unless scalar @ARGV;

my $DEBUG;
my $VERSION = "v0.3.3";
# Dispatch table to be parsed before url.
my %dispatch = (
    '-v'  => sub { print "Draco $VERSION\n"; exit; },
    '-d'  => sub { $DEBUG = 1; print STDERR "draco: debug on.\n"; },
    '-h'  => sub { print qq{Draco $VERSION

Options:
    -d
        Turn on debug messages. Debug messages will be printed to
        STDERR.
    -h
        Print this help.
    -v
        Print version.

Environment Variables:
    FETCH_ALL
        Fetch all comments. This will make multiple HTTP calls to
        reddit.
};
                   exit;
               },
);
if (exists $dispatch{$ARGV[0]}) {
    # shift @ARGV to get $url in next shift.
    $dispatch{shift @ARGV}->();
}

# This is the start time.
my $start_time = time;
my $last_log = $start_time;

# $url contains the reddit post. Raise the limit to 500 comments which
# is the maximum reddit allows.
my $url = shift @ARGV;
my $json_url = "${url}.json?limit=500&sort=top";

my $http = HTTP::Tiny->new( verify_SSL => 1 );

# Fetch the post.
print_time() if $DEBUG;
print STDERR "fetching `$json_url'.\n" if $DEBUG;
my $response = get_response($json_url);

# Decode json.
print STDERR "decoding json response.\n" if $DEBUG;
my $json_data = decode_json($response->{content});

# $post contains post data
my $post = $json_data->[0]->{data}->{children}->[0]->{data};

# $comments contains comment data. We are interested in: replies,
# author, body, created_utc & permalink.
my $comments = $json_data->[1]->{data}->{children};

# Start the Org document.
print "#+", "STARTUP:content\n";

# Print the date.
my $current_date = Time::Piece->new->strftime('%+');
print "#+", "DATE: $current_date\n";
print "\n";

# Print the post title & it's link.
print "* ", "[[$post->{url}][$post->{title}]]\n";

# Add various details to :PROPERTIES:.
print ":PROPERTIES:\n";
# Include the created date & archive date in properties.
print ":CREATED_UTC: ",
    Time::Piece->strptime($post->{created_utc}, '%s')
    ->strftime('%+'), "\n";

print ":ARCHIVE_DATE: $current_date\n";
foreach my $detail (qw( subreddit created_utc author permalink
                        upvote_ratio ups downs score )) {
    print ":${detail}: =$post->{$detail}=\n"
        if scalar $post->{$detail};
}
print ":END:\n";

# Add selftext if present.
print "\n#+BEGIN_SRC markdown\n",
    # Break the text at 76 column & add 2 space before every new line.
    "  ", $lb->break($post->{selftext}) =~ s/\n/\n\ \ /gr, "\n",
    "#+END_SRC\n"
    if scalar $post->{selftext};

my (@http_calls, %counter);
$counter{print_comment_chain_call} = 0;
$counter{iterate_over_comments_call} = 0;

print_time() if $DEBUG;
print STDERR "iterating over top-level comments.\n" if $DEBUG;

# We are going to put a dot after each HTTP call. Dot on first HTTP
# call is skipped so we print this message only if user has set
# `FETCH_ALL' because that'll be the reason Draco makes multiple HTTP
# calls.
print STDERR "each dot is a HTTP call.\n"
    if $DEBUG and $ENV{FETCH_ALL};

# Iterate over top-level comments. The second argument is level
# (depth), it should be 0 for top-level comments.
iterate_over_comments($comments, 0);

# Seperate the dots with the rest by a "\n".
print STDERR "\n"
    if $DEBUG and $ENV{FETCH_ALL};

print_time() if $DEBUG;

# Print important stats.
print STDERR "\n" if $DEBUG;
print STDERR "total http calls: ",
    scalar @http_calls, "\n" if $DEBUG;
print STDERR "total print_comment_chain calls: ",
    $counter{print_comment_chain_call}, "\n" if $DEBUG;
print STDERR "total iterate_over_comments calls: ",
    $counter{iterate_over_comments_call}, "\n" if $DEBUG;

sub print_time {
    print STDERR "    ";
    print STDERR "time since [start, last log]: [", time - $start_time,
        ", ", time - $last_log, "] seconds\n";
    $last_log = time;
}

sub get_response {
    my $url = shift @_;
    # Skip a dot on first HTTP call.
    print STDERR "." if $DEBUG and scalar @http_calls > 0;
    my $response = $http->get($url);
    push @http_calls, $url;
    die "\nUnexpected response - $response->{status}: $response->{reason}\n$response->{content} : $url"
        unless $response->{success};
    return $response;
}

# Pass <comment id> as argument and it'll return you the json url to
# that comment thread.
sub get_comment_thread_from_id {
    my $comment_id = shift @_;

    # Reddit doesn't like this kind of url:
    #     http://<reddit>/<post_id>//<comment_id>.json
    #
    # It wants this kind of url:
    #     http://<reddit>/<post_id>/<comment_id>.json
    #
    # Notice the extra '/' in first url.
    my $json_url = $url;
    $json_url .= "/" unless substr $url, -1 eq "/";
    $json_url .= "${comment_id}.json?limit=500&sort=top";
    return $json_url;
}

# This was being used multiple times so I moved it to a subroutine.
# It'll take $comment_id & return $comments.
sub get_all_comments_from_id {
    my $comment_id = shift @_;
    my $json_url = get_comment_thread_from_id($comment_id);

    # Fetch the comment.
    my $response = get_response($json_url);

    # Decode json.
    my $json_data = decode_json($response->{content});

    # $comments contains comment data.
    my $comments = $json_data->[1]->{data}->{children};

    return $comments;
}

# First argument requires $comments & second is the level (depth).
sub iterate_over_comments {
    my $comments = shift @_;
    my $level = shift @_;

    $counter{iterate_over_comments_call}++;

    foreach my $comment ($comments->@*) {
        my $comment_data = $comment->{data};

        # There are 3 kind of comments.
        #
        # 1. normal comments (includes top-level comments & replies).
        # 2. comments hidden under "load more comments".
        # 3. comments hidden under "continue this thread".

        # We will be dealing with them in reverse order, i.e. 3rd ->
        # 2nd -> 1st.

        # This comment we are skipping is the third kind of comment,
        # i.e. comment hidden under "continue this thread".
        if ($comment->{kind} eq "more"
            and $comment_data->{id} eq "_") {
            # $comment_data->{parent_id} starts with "t1_" so we get
            # the {id} from there.
            my $comment_id = substr $comment_data->{parent_id}, 3;

            # Don't proceed unless user has set `FETCH_ALL'.
            next unless $ENV{FETCH_ALL};

            unless ( eval {
                # get $comments.
                my $comments = get_all_comments_from_id($comment_id);

                # 0th index will contain the comment we are looking for.
                my $comment_data = $comments->[0]->{data};

                # We already have printed this comment so we will just
                # look it has replies, if it does then we'll print
                # those.
                if (scalar $comment_data->{replies}) {
                    iterate_over_comments(
                        $comment_data->{replies}->{data}->{children},
                        $level + 1);
                }
                return 1;
            } ) {
                my $err = $@;
                # Keep the "\n" at the start.
                warn "\nparsing `$comment_id' failed: $err\n";
            }

            # This comment thread has been parsed, move on to the text
            # one.
            next;
        }

        # These are second kind of comments, i.e. comments hidden
        # under "load more comments". Their kind is "more" & they have
        # an id. This part is a bit complex so read the comments.
        #
        # We can get it by making another HTTP call. This is skipped
        # by default & user has to pass `FETCH_ALL' to enable it.
        if ($comment->{kind} eq "more"
                and $comment_data->{id}) {
            # Don't proceed unless user has set `FETCH_ALL'.
            next unless $ENV{FETCH_ALL};

            # The problem here is that if the thread is too large then
            # at the end reddit will include similar block in which
            # it'll put all other top-level comments in children of
            # 2nd kind.
            #
            # "kind": "more",
            # "data": {
            # "id": "gde31fk",
            # ...
            # "children": [
            # "gde31fk",
            # "gdbrnyd",
            # ...
            #
            # If the "load more comments" only hides a single thread
            # then it's the only one included in "children". Note how
            # the "id" & first element of "children" is same.
            #
            # So, reddit wants us to check for the length of this list
            # "children", if it is greater than 1 then we need to pull
            # those comments independently. If not then we just have
            # to pull the "id".

            if ($comment_data->{children}
                and scalar $comment_data->{children} < 2) {
                unless ( eval {
                    # get $comments.
                    my $comments = get_all_comments_from_id(
                        $comment_data->{id}
                    );

                    # Now this is like a normal comment chain, i.e.
                    # first kind of comment. We just have to iterate
                    # over it & pass to print_comment_chain,
                    # iterate_over_comments will handle it.
                    iterate_over_comments($comments, $level);
                    return 1;
                } ) {
                    my $err = $@;
                    # Keep the "\n" at the start.
                    warn "\nparsing `$comment_data->{id}' failed: $err\n";
                }
            } else {
                # If we reach this block then it means that multiple
                # comments are hiding under "load more comments", we
                # will make one call for each comment, this can mean a
                # lot of HTTP calls.
                foreach my $comment_id ($comment_data->{children}->@*) {
                    unless ( eval {
                        # get $comments.
                        my $comments = get_all_comments_from_id($comment_id);

                        # Now this is like a normal comment chain, i.e.
                        # first kind of comment. We just have to iterate
                        # over it & pass to print_comment_chain,
                        # iterate_over_comments will handle it.
                        iterate_over_comments($comments, $level);
                        return 1;
                    } ) {
                        my $err = $@;
                        # Keep the "\n" at the start.
                        warn "\nparsing `$comment_data->{id}' failed: $err\n";
                    }
                }
            }
            # This comment thread has been parsed, move on to the next
            # one.
            next;
        }
        # This is first kind of comment, we can pass it directly to
        # print_comment_chain.
        print_comment_chain($comment_data, $level);
    }
}

# print_comment_chain will print the whole chain of comment while
# accounting for level. It can only parse the first kind of comment,
# i.e. top-level comments & their replies. To learn about kinds of
# comments, check iterate_over_comments() subroutine.
sub print_comment_chain {
    # This was earlier called $comment & was changed to $comment_data
    # to prevent confusion because it is $comment->{data}.
    my $comment_data = shift @_;
    my $level = shift @_;

    $counter{print_comment_chain_call}++;

    print "*" x ($level + 2), " ";
    # If the author name starts & ends with "_" then Org will
    # underline it, so we put those names in a code block.
    if (substr($comment_data->{author}, 0, 1) eq "_"
        and substr($comment_data->{author}, -1) eq "_") {
        print "=$comment_data->{author}=";
    } else {
        print "$comment_data->{author}";
    }
    print " [S]" if $comment_data->{is_submitter};
    print "\n";

    # Print comment details.
    print ":PROPERTIES:\n";
    print ":CREATED_UTC: ",
        Time::Piece->strptime($comment_data->{created_utc}, '%s')
          ->strftime('%+'), " (=$comment_data->{created_utc}=)\n";
    foreach my $detail (qw( author permalink score edited stickied
                            controversiality author_flair_text )) {
        print ":${detail}: =$comment_data->{$detail}=\n"
            if scalar $comment_data->{$detail};
    }
    print ":END:\n";

    print "\n#+begin_src markdown\n",
        # Break the text at 76 column & add 2 space before every new
        # line.
        "  ", $lb->break($comment_data->{body}) =~ s/\n/\n\ \ /gr, "\n",
        "#+end_src\n\n";

    # If the comment has replies then iterate over those too.
    if (scalar $comment_data->{replies}) {
        iterate_over_comments($comment_data->{replies}->{data}->{children},
                              $level + 1);
    }
}