#!/usr/bin/env python3
#
# Copyright (C) 2008 Henri Hakkinen
#
# Copyright (C) 2015-2016 Arun Prakash Jana <engineerarun@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function
import argparse
import collections
import gzip
import io
import os
import signal
import sys
import textwrap
import webbrowser

# 2to3 compatibility layer
#
# Bind one set of names (name2codepoint, HTMLParser, url_quote_plus,
# HTTPSConnection, unichr, raw_input) on both Python 2 and Python 3 so the
# rest of the file can use them without further version checks.
if sys.version_info > (3,):
    from html.entities import name2codepoint
    import html.parser as HTMLParser
    from urllib.parse import quote_plus as url_quote_plus
    from http.client import HTTPSConnection

    # Python 3 dropped these builtins; alias them to their replacements.
    unichr = chr
    raw_input = input
else:
    from htmlentitydefs import name2codepoint
    import HTMLParser
    from urllib import quote_plus as url_quote_plus
    from httplib import HTTPSConnection

    # Set the encoding of standard streams to UTF-8 (unnecessary for Python 3)
    # Note: this replaces sys.stdout/sys.stderr with codecs writer wrappers.
    import codecs
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf-8')(sys.stderr)

"""
POSIX compatibility layer
 - os.get_terminal_size is only available on Python 3.3+.
 - fcntl and termios are only available on POSIX systems.

 This version of platform-independent get_terminal_size always returns a tuple
 (columns, lines), just like os.get_terminal_size. When unavailable, returns (0, 0).
"""
def _environ_get_terminal_size():
    """Return the terminal size from the COLUMNS and LINES environment variables.

    Returns a tuple (columns, lines). A variable that is unset, empty or not
    a valid integer contributes 0, so the function never raises and returns
    (0, 0) when no size information is available.
    """

    def _int_from_env(name):
        # int('') and int('abc') raise ValueError; treat such values the same
        # way as a missing variable instead of crashing the caller.
        try:
            return int(os.environ.get(name, 0))
        except ValueError:
            return 0

    return (_int_from_env('COLUMNS'), _int_from_env('LINES'))

if hasattr(os, 'get_terminal_size'):
    def get_terminal_size():
        """Return the terminal size using os.get_terminal_size (Python 3.3+).

        When that call fails with OSError, the size is taken from the
        COLUMNS/LINES environment variables instead.
        """

        try:
            size = os.get_terminal_size()
        except OSError:
            size = _environ_get_terminal_size()
        return size
else:
    try:
        import fcntl
        import struct
        import termios
    except ImportError:
        # The POSIX-only modules are missing: the environment variables are
        # the only remaining source of size information.
        get_terminal_size = _environ_get_terminal_size
    else:
        def get_terminal_size():
            """Return the terminal size as (columns, lines) on POSIX systems.

            Queries the window size of stderr with the TIOCGWINSZ ioctl and
            falls back to the COLUMNS/LINES environment variables when the
            ioctl fails or reports a non-positive dimension.
            """

            try:
                packed = fcntl.ioctl(sys.stderr, termios.TIOCGWINSZ, '1234')
                # The first two unsigned shorts are rows, then columns.
                rows, cols = struct.unpack('HH', packed)
            except IOError:
                rows, cols = 0, 0

            if rows <= 0 or cols <= 0:
                return _environ_get_terminal_size()
            return (cols, rows)

# Python optional dependency compatibility layer
try:
    import readline
except ImportError:
    pass


def sigint_handler(signum, frame):
    """Handle SIGINT: report the interruption on stderr and exit with status 1.

    signum and frame are the standard signal handler arguments; neither is
    used.
    """

    notice = '\nInterrupted.'
    print(notice, file=sys.stderr)
    raise SystemExit(1)

# Register signal handler
# Done at module import time, so an interrupt (SIGINT) received at any later
# point is routed to sigint_handler, which prints a notice and exits.
signal.signal(signal.SIGINT, sigint_handler)


# Constants

# Maps a one-letter color code to its ANSI escape sequence.
# Lowercase a-p select the colors 30-37 and 90-97; the uppercase letter is the
# same color with ';1' appended. x, X, y and Y map to 0, 1, 7 and 7;1.
COLORMAP = dict(
    (letter, '\x1b[%sm' % sgr)
    for letter, sgr in (
        ('a', '30'), ('b', '31'), ('c', '32'), ('d', '33'),
        ('e', '34'), ('f', '35'), ('g', '36'), ('h', '37'),
        ('i', '90'), ('j', '91'), ('k', '92'), ('l', '93'),
        ('m', '94'), ('n', '95'), ('o', '96'), ('p', '97'),
        ('A', '30;1'), ('B', '31;1'), ('C', '32;1'), ('D', '33;1'),
        ('E', '34;1'), ('F', '35;1'), ('G', '36;1'), ('H', '37;1'),
        ('I', '90;1'), ('J', '91;1'), ('K', '92;1'), ('L', '93;1'),
        ('M', '94;1'), ('N', '95;1'), ('O', '96;1'), ('P', '97;1'),
        ('x', '0'), ('X', '1'), ('y', '7'), ('Y', '7;1'),
    )
)


# Global variables

# Module-level settings shared by the parser and the printing code below.
# The option letters in the comments refer to command-line options.
columns = None              # Terminal width; used as the wrap width when printing abstracts
start = 0                   # The first result to display (option -s)
num = None                  # Number of results to display (option -n)
lang = None                 # Language to search for (option -l)
lucky = False               # If True, opens the first URL in browser (option -j)
colorize = True             # If True, colorizes the output (option -C)
colors = None               # Colors object, set after reading color settings (option --colors)
duration = None             # Time limit search (option -t) [e.g. h5, d5, w5, m5, y5]
conn = None                 # Use a single global connection during navigation
nav = 'n'                   # For user navigation
debug = False               # Print debug logs
news = False                # Read news (switches the parser to the Google News result layout)
exact = False               # If True, disable automatic spelling correction
insite = None               # Google search a specific site
output_format = 0           # 0: Regular (normal) output, 1: Json output
noninteractive = False      # Non-interactive mode
server = "www.google.com"   # Default server to connect to
# User agent string; identifies the client as Microsoft Edge
ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240')
_VERSION_ = '2.4.1'         # Current version


# Classes

def annotate_tag(annotated_starttag_handler):
    """Decorator turning an annotated starttag handler into a handle_starttag.

    See parser logic within the GoogleParser class for documentation.
    In particular, search for "Ignore List" to view detailed
    documentation of the ignore list.

    The decorated function must be named SCOPE_start and have the signature

        annotated_starttag_handler(self, tag: str, attrsdict: dict) -> annotation

    The returned function has the HTMLParser.handle_starttag signature

        handler(self, tag: str, attrs: list) -> None

    and, for every start tag, pushes exactly one annotation onto the tag's
    annotation stack through self.insert_annotation:

    - None when the current scope is 'ignore';
    - SCOPE + '_ignored' when the tag matches a selector of
      self.IGNORE_LIST, in which case the handlers are also switched to the
      'ignore' scope;
    - otherwise whatever the decorated handler returns.
    """

    # Get context; assumes that the handler is called SCOPE_start. The name
    # never changes, so strip the '_start' suffix once at decoration time.
    context = annotated_starttag_handler.__name__[:-6]

    def handler(self, tag, attrs):
        # If context is 'ignore', ignore all tests
        if context == 'ignore':
            self.insert_annotation(tag, None)
            return

        attrs = dict(attrs)

        # Compare against ignore list
        ignored = False
        for selector in self.IGNORE_LIST:
            for attr in selector:
                if attr == 'tag':
                    if tag != selector['tag']:
                        break
                elif attr == 'class':
                    # Class matching is by inclusion, like a CSS selector
                    tag_classes = set(self.classes(attrs))
                    selector_classes = set(self.classes(selector))
                    if not selector_classes.issubset(tag_classes):
                        break
                else:
                    # A tag that lacks the attribute altogether cannot
                    # match; check presence first so that a missing
                    # attribute is a non-match rather than a KeyError.
                    if attr not in attrs or attrs[attr] != selector[attr]:
                        break
            else:
                # Passed all criteria of the selector
                ignored = True
                break

        # If tag matches ignore list, annotate and hand over to ignore_*
        if ignored:
            self.insert_annotation(tag, context + '_ignored')
            self.set_handlers_to('ignore')
            return

        # Standard
        annotation = annotated_starttag_handler(self, tag, attrs)
        self.insert_annotation(tag, annotation)

    return handler

def retrieve_tag_annotation(annotated_endtag_handler):
    """Decorator turning an annotated endtag handler into a handle_endtag.

    See parser logic within the GoogleParser class for documentation.

    The decorated function has the signature

        annotated_endtag_handler(self, tag: str, annotation) -> None

    and the returned function has the HTMLParser.handle_endtag signature

        handler(self, tag: str) -> None

    The annotation passed on is the one most recently pushed for this tag
    name in self.tag_annotations. An end tag with no recorded start tag
    (a stray closing tag in malformed HTML) is passed on with a None
    annotation instead of raising KeyError/IndexError.
    """

    def handler(self, tag):
        stack = self.tag_annotations.get(tag)
        # A missing or empty stack means this end tag has no matching start tag
        annotation = stack.pop() if stack else None
        annotated_endtag_handler(self, tag, annotation)

    return handler

class GoogleParser(HTMLParser.HTMLParser):
    """The members of this class parse the result
    HTML page fetched from Google server for a query.

    The custom parser looks for tags enclosing search
    results and extracts the URL, title and text for
    each search result.

    After parsing the complete HTML page results are
    available in self.results, a list of objects of class Result.
    """

    # Parser logic:
    #
    # - Guiding principles:
    #
    #   1. Tag handlers are contextual;
    #
    #   2. Contextual starttag and endtag handlers should come in pairs
    #      and have a clear hierarchy;
    #
    #   3. Starttag handlers should only yield control to a pair of
    #      child handlers (that is, one level down the hierarchy), and
    #      correspondingly, endtag handlers should only return control
    #      to the parent (that is, the pair of handlers that gave it
    #      control in the first place).
    #
    #   Principle 3 is meant to enforce a (possibly implicit) stack
    #   structure and thus prevent careless jumps that result in what's
    #   essentially spaghetti code with liberal use of GOTOs.
    #
    # - HTMLParser.handle_endtag gives us a bare tag name without
    #   context, which is not good for enforcing principle 3 when we
    #   have, say, nested div tags.
    #
    #   In order to precisely identify the matching opening tag, we
    #   maintain a stack for each tag name with *annotations*. Important
    #   opening tags (e.g., the ones where child handlers are
    #   registered) can be annotated so that we can watch for the
    #   annotation in the endtag handler, and when the appropriate
    #   annotation is popped, we perform the corresponding action (e.g.,
    #   switch back to old handlers).
    #
    #   To facilitate this, each starttag handler is decorated with
    #   @annotate_tag, which accepts a return value that is the
    #   annotation (None by default), and additionally converts attrs to
    #   a dict, which is much easier to work with; and each endtag
    #   handler is decorated with @retrieve_tag_annotation which sends
    #   an additional parameter that is the retrieved annotation to the
    #   handler.
    #
    #   Note that some of our tag annotation stacks leak over time: this
    #   happens to tags like <img> and <hr> which are not
    #   closed. However, these tags play no structural role, and come
    #   only in small quantities, so it's not really a problem.
    #
    # - All textual data (result title, result abstract, etc.) are
    #   processed through a set of shared handlers. These handlers store
    #   text in a shared buffer self.textbuf which can be retrieved and
    #   cleared at appropriate times.
    #
    #   Data (including charrefs and entityrefs) are ignored initially,
    #   and when data needs to be recorded, the start_populating_textbuf
    #   method is called to register the appropriate data, charref and
    #   entityref handlers so that they append to self.textbuf. When
    #   recording ends, pop_textbuf should be called to extract the text
    #   and clear the buffer. stop_populating_textbuf returns the
    #   handlers to their pristine state (ignoring data).
    #
    #   Methods:
    #   - start_populating_textbuf(self, data_transformer: Callable[[str], str]) -> None
    #   - pop_textbuf(self) -> str
    #   - stop_populating_textbuf(self) -> None
    #
    # - Outermost starttag and endtag handler methods: main_*. The whole
    #   parser starts and ends in this state.
    #
    # - Each result is wrapped in a <div> tag with class "g".
    #
    #   <!-- within the scope of main_* -->
    #   <div class="g">  <!-- annotate as 'result', hand over to result_* -->
    #   </div>           <!-- hand back to main_*, register result -->
    #
    # - For each result, the first <h3> tag with class "r" contains the
    #   hyperlinked title, and the (optional) first <div> tag with class
    #   "s" contains the abstract of the result.
    #
    #   <!-- within the scope of result_* -->
    #   <h3 class="r">   <!-- annotate as 'title', hand over to title_* -->
    #   </h3>            <!-- hand back to result_* -->
    #   <div class="s">  <!-- annotate as 'abstract', hand over to abstract_* -->
    #   </div>           <!-- hand back to result_* -->
    #
    # - Each title looks like
    #
    #   <h3 class="r">
    #     <!-- within the scope of title_* -->
    #     <span>                 <!-- filetype (optional), annotate as title_filetype,
    #                                 start_populating_textbuf -->
    #       file type (e.g. [PDF])
    #     </span>                <!-- stop_populating_textbuf -->
    #     <a href="result url">  <!-- register self.url, annotate as 'title_link',
    #                                 start_populating_textbuf -->
    #       result title
    #     </a>                   <!-- stop_populating_textbuf, pop to self.title -->
    #   </h3>
    #
    # - For each abstract, the first <span> tag with class "st" contains
    #   the body text of the abstract.
    #
    #   <!-- within the scope of abstract_* -->
    #   <span class="st">  <!-- annotate as 'abstract_text', start_populating_textbuf -->
    #     abstract text with <em> markup on keywords
    #   </span>            <!-- stop_populating_textbuf, pop to self.abstract -->
    #
    # - Certain results may come with sitelinks, secondary results that
    #   are usually subdomains or deep links within the primary
    #   result. They are organized into a <table> tag, and each sitelink
    #   is in a separate <td>:
    #
    #   <!-- within the scope of result_* -->
    #   <table>    <!-- annotate as 'sitelink_table', hand over to sitelink_table_* -->
    #     <tr>
    #       <td>   <!-- annotate as 'sitelink', hand over to sitelink_* -->
    #       </td>  <!-- append to self.sitelinks, hand back to sitelink_table_* -->
    #       <td></td>
    #       ...
    #     </tr>
    #     <tr></tr>
    #     ...
    #   </table>   <!-- hand back to result_* -->
    #
    #   Then for each sitelink, the hyperlinked title is in an <h3> tag
    #   with class "r", and the abstract is in a <div> tag with class
    #   "st". They are not necessarily on the same level, but we don't
    #   really care.
    #
    #   <!-- within the scope of sitelink_* -->
    #   <h3 class="r">             <!-- annotate as 'sitelink_title',
    #                                   hand over to sitelink_title_* -->
    #     <a href="sitelink url">  <!-- register sitelink url, annotate as 'sitelink_title_link',
    #                                   start_populating_textbuf -->
    #       sitelink title
    #     </a>                     <!-- stop_populating_textbuf, pop to sitelink title -->
    #   </h3>                      <!-- hand back to sitelink_* -->
    #
    #   <!-- still within the scope of sitelink_* -->
    #   <div class="st">  <!-- annotate as 'sitelink_abstract', start_populating_textbuf -->
    #     abstract text
    #   </div>            <!-- stop_populating_textbuf, pop to sitelink abstract -->
    #
    #
    # Google News
    #
    # - Google News results differ from Google Search results in the
    #   following ways:
    #
    #   For each result, the title is in the same format, but there's a
    #   metadata field in a <div> tag with class "slp", and the abstract
    #   isn't as deeply embedded: it's in a <div> tag on the same level
    #   with class "st".
    #
    #   <!-- within the scope of result_* -->
    #   <h3 class="r"></h3>  <!-- as before -->
    #   <div class="slp">    <!-- annotate as 'news_metadata', start_populating_textbuf -->
    #     ...
    #     <span>source</span>
    #     <span>-</span>     <!-- transform to ', ' -->
    #     <span>publishing time</span>
    #   </div>               <!-- stop_populating_textbuf, pop to self.metadata -->
    #   <div class="st">     <!-- annotate as 'news_abstract', start_populating_textbuf -->
    #     abstract text again with <em> markup on keywords
    #   </div>               <!-- stop_populating_textbuf, pop to self.abstract -->
    #
    #
    # Ignore List
    #
    # - As good as our result criteria might be, sometimes results of
    #   dubious value (usually from Google's value-add features) slip
    #   through. The "People also ask" feature is a good example of this
    #   type (a sample query is "VPN"; see screenshot
    #   https://i.imgur.com/yfcsoQz.png). In these cases, we may want to
    #   skip enclosing containers entirely. The ignore list feature is
    #   designed for this purpose.
    #
    #   The current ignore list is available in self.IGNORE_LIST. Each
    #   entry (called a "selector") is a dict of attribute-value
    #   pairs. Each attribute is matched verbatim to a tag's attribute,
    #   except the "class" attribute, where we test for inclusion
    #   instead (e.g. "c b a" matches "a b", just like it matches the
    #   CSS selector ".a.b"). There's also a special "attribute" -- tag,
    #   the meaning of which is obvious. A tag has to match all given
    #   attributes to be considered a match for the selector.
    #
    #   When a match is found, the tag is annotated as SCOPE_ignored,
    #   where SCOPE is the current handler scope (e.g., main, result,
    #   title, etc.), and the scope is switched to 'ignore'. All
    #   descendants of the tag are ignored. When the corresponding end
    #   tag is finally reached, the former scope is restored.

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # Parsed results, in page order
        self.results = []

        # Index of the most recently registered result (1-based once set)
        self.index = 0
        # Shared text buffer, see start_populating_textbuf / pop_textbuf
        self.textbuf = ''
        # Maps a tag name to its stack of annotations
        self.tag_annotations = {}

        self.set_handlers_to('main')

    ### Ignore list ###
    IGNORE_LIST = [
        # "People also ask"
        # Sample query: VPN
        # Screenshot: https://i.imgur.com/yfcsoQz.png
        {
            'tag': 'div',
            'class': 'related-question-pair'
        }
    ]

    ### Tag handlers ###

    @annotate_tag
    def main_start(self, tag, attrs):
        # We omit "card" results which usually have a class list along
        # the line of "g mnr-c g-blk".
        if tag == 'div' and attrs.get('class') == 'g':
            # Initialize result field registers
            self.title = ''
            self.url = ''
            self.abstract = ''
            self.metadata = '' # Only used for Google News
            self.sitelinks = []

            # Guard against sitelinks, which also have titles and
            # abstracts.  In the case of news, guard against "card
            # sections" (secondary results to the same event).
            self.title_registered = False
            self.abstract_registered = False
            self.metadata_registered = False # Only used for Google News

            self.set_handlers_to('result')
            return 'result'

    @retrieve_tag_annotation
    def main_end(self, tag, annotation):
        pass

    @annotate_tag
    def result_start(self, tag, attrs):
        if not self.title_registered and tag == 'h3' and 'r' in self.classes(attrs):
            self.set_handlers_to('title')
            return 'title'

        if not self.abstract_registered and tag == 'div' and 's' in self.classes(attrs):
            self.set_handlers_to('abstract')
            return 'abstract'

        if not self.sitelinks and tag == 'table':
            self.set_handlers_to('sitelink_table')
            return 'sitelink_table'

        global news
        if news:
            if not self.metadata_registered and tag == 'div' and 'slp' in self.classes(attrs):
                # Change metadata field separator from '-' to ', ' for better appearance
                self.start_populating_textbuf(lambda text: ', ' if text == '-' else text)
                return 'news_metadata'

            if not self.abstract_registered and tag == 'div' and 'st' in self.classes(attrs):
                self.start_populating_textbuf()
                return 'news_abstract'

    @retrieve_tag_annotation
    def result_end(self, tag, annotation):
        if annotation == 'result':
            # Only results that yielded a URL are registered
            if self.url:
                self.index += 1
                result = Result(self.index, self.title, self.url, self.abstract,
                                metadata=self.metadata if self.metadata else None,
                                sitelinks=self.sitelinks)
                self.results.append(result)
            self.set_handlers_to('main')
        elif annotation == 'news_metadata':
            self.stop_populating_textbuf()
            self.metadata = self.pop_textbuf()
            self.metadata_registered = True
        elif annotation == 'news_abstract':
            self.stop_populating_textbuf()
            self.abstract = self.pop_textbuf()
            self.abstract_registered = True

    @annotate_tag
    def title_start(self, tag, attrs):
        if tag == 'span':
            # Print a space after the filetype indicator
            self.start_populating_textbuf(lambda text: text + ' ')
            return 'title_filetype'
        if tag == 'a' and 'href' in attrs:
            self.url = attrs['href']
            self.start_populating_textbuf()
            return 'title_link'

    @retrieve_tag_annotation
    def title_end(self, tag, annotation):
        if annotation == 'title_filetype':
            # The filetype text stays in the buffer and becomes a prefix
            # of the title popped at the end of the link.
            self.stop_populating_textbuf()
        elif annotation == 'title_link':
            self.stop_populating_textbuf()
            self.title = self.pop_textbuf()
            self.title_registered = True
        elif annotation == 'title':
            self.set_handlers_to('result')

    @annotate_tag
    def abstract_start(self, tag, attrs):
        if tag == 'span' and 'st' in self.classes(attrs):
            self.start_populating_textbuf()
            return 'abstract_text'

    @retrieve_tag_annotation
    def abstract_end(self, tag, annotation):
        if annotation == 'abstract_text':
            self.stop_populating_textbuf()
            self.abstract = self.pop_textbuf()
            # Mark the abstract as registered so that result_start does
            # not hand a later <div class="s"> of the same result over to
            # abstract_* and overwrite it (same convention as
            # title_registered and the news_abstract branch).
            self.abstract_registered = True
        elif annotation == 'abstract':
            self.set_handlers_to('result')

    @annotate_tag
    def sitelink_table_start(self, tag, attrs):
        if tag == 'td':
            # Initialize a new sitelink
            self.current_sitelink = Sitelink('', '', '')
            self.set_handlers_to('sitelink')
            return 'sitelink'

    @retrieve_tag_annotation
    def sitelink_table_end(self, tag, annotation):
        if annotation == 'sitelink_table':
            self.set_handlers_to('result')

    @annotate_tag
    def sitelink_start(self, tag, attrs):
        if tag == 'h3' and 'r' in self.classes(attrs):
            self.set_handlers_to('sitelink_title')
            return 'sitelink_title'
        if tag == 'div' and 'st' in self.classes(attrs):
            self.start_populating_textbuf()
            return 'sitelink_abstract'

    @retrieve_tag_annotation
    def sitelink_end(self, tag, annotation):
        if annotation == 'sitelink_abstract':
            self.stop_populating_textbuf()
            self.current_sitelink.abstract = self.pop_textbuf()
        elif annotation == 'sitelink':
            # Only sitelinks that yielded a URL are kept
            if self.current_sitelink.url:
                self.sitelinks.append(self.current_sitelink)
            self.set_handlers_to('sitelink_table')

    @annotate_tag
    def sitelink_title_start(self, tag, attrs):
        if tag == 'a' and 'href' in attrs:
            self.current_sitelink.url = attrs['href']
            self.start_populating_textbuf()
            return 'sitelink_title_link'

    @retrieve_tag_annotation
    def sitelink_title_end(self, tag, annotation):
        if annotation == 'sitelink_title_link':
            self.stop_populating_textbuf()
            self.current_sitelink.title = self.pop_textbuf()
        elif annotation == 'sitelink_title':
            self.set_handlers_to('sitelink')

    ### Generic methods ###

    # Set handle_starttag to SCOPE_start, and handle_endtag to SCOPE_end.
    def set_handlers_to(self, scope):
        self.handle_starttag = getattr(self, scope + '_start')
        self.handle_endtag = getattr(self, scope + '_end')

    # Push annotation onto the annotation stack of tag, creating the
    # stack on first use.
    def insert_annotation(self, tag, annotation):
        if tag not in self.tag_annotations:
            self.tag_annotations[tag] = []
        self.tag_annotations[tag].append(annotation)

    @annotate_tag
    def ignore_start(self, tag, attrs):
        pass

    @retrieve_tag_annotation
    def ignore_end(self, tag, annotation):
        if annotation and annotation.endswith('_ignored'):
            # Strip the '_ignored' suffix (8 characters) from the
            # annotation to obtain the outer context name.
            context = annotation[:-8]
            self.set_handlers_to(context)

    def start_populating_textbuf(self, data_transformer=None):
        """Start appending character data to self.textbuf.

        If data_transformer is given, each chunk of plain data is passed
        through it before being appended; entity and character
        references are always recorded untransformed.
        """
        if data_transformer is None:
            # Record data verbatim
            self.handle_data = self.record_data
        else:
            def record_transformed_data(data):
                self.textbuf += data_transformer(data)

            self.handle_data = record_transformed_data

        self.handle_entityref = self.record_entityref
        self.handle_charref = self.record_charref

    def pop_textbuf(self):
        """Return the buffered text and clear the buffer."""
        text = self.textbuf
        self.textbuf = ''
        return text

    def stop_populating_textbuf(self):
        """Go back to ignoring data; the buffer content is kept."""
        self.handle_data = lambda data: None
        self.handle_entityref = lambda ref: None
        self.handle_charref = lambda ref: None

    def record_data(self, data):
        self.textbuf += data

    def record_entityref(self, ref):
        try:
            self.textbuf += unichr(name2codepoint[ref])
        except KeyError:
            # Entity name not found; most likely rather sloppy HTML
            # where a literal ampersand is not escaped; For instance,
            # the HTML response returned by
            #
            #     googler -c au -l ko expected
            #
            # contains the following tag
            #
            #     <p class="_e4b"><a href="...">expected market return s&p 500</a></p>
            #
            # where &p is interpreted by HTMLParser as an entity (this
            # behaviour seems to be specific to Python 2.7).
            self.textbuf += '&' + ref

    def record_charref(self, ref):
        # ref is the text between '&#' and ';': either decimal digits, or
        # 'x'/'X' followed by hexadecimal digits.
        try:
            if ref[:1] in ('x', 'X'):
                char = unichr(int(ref[1:], 16))
            else:
                char = unichr(int(ref))
        except (ValueError, OverflowError):
            # Malformed number or code point out of range: keep the
            # reference as literal text rather than aborting the parse
            # (same policy as record_entityref).
            char = '&#' + ref + ';'
        self.textbuf += char

    @staticmethod
    def classes(attrs):
        """Get tag's classes from its attribute dict.

        Returns a list of class names; empty when the class attribute is
        missing, empty, or present without a value (None).
        """
        return (attrs.get('class') or '').split()


class Sitelink:
    """A secondary link shown under a result: title, URL and abstract.

    All three fields are plain, writable attributes.
    """

    def __init__(self, title, url, abstract):
        self.title, self.url, self.abstract = title, url, abstract


class Result:
    """One search result: index, title, URL and abstract, plus optional
    metadata (Google News) and an optional list of sitelinks.
    """

    def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None):
        self.index = index
        self.title = title
        self.url = url
        self.abstract = abstract
        self.metadata = metadata
        # Give every instance its own list when none is supplied
        if sitelinks is None:
            sitelinks = []
        self.sitelinks = sitelinks

    @staticmethod
    def print_title_and_url(index, title, url, indent=0):
        """Print the index and title on one line and the URL on the next.

        The index and the URL are each prefixed with `indent` spaces.
        Colors come from the module-level `colors` when `colorize` is set.
        """
        padding = " " * indent
        padded_index = padding + str(index)
        padded_url = padding + url

        if not colorize:
            print(" %s %s\n%s" % (padded_index, title, padded_url))
            return

        print(colors.index + padded_index + colors.reset, end="")
        print(' ' + colors.title + title + colors.reset)
        print(colors.url + padded_url + colors.reset)

    @staticmethod
    def print_metadata_and_abstract(abstract, metadata=None, indent=0):
        """Print the optional metadata line, then the abstract and a blank line.

        When the module-level `columns` leaves room (more than indent + 1),
        the abstract is wrapped to columns - indent - 1 characters and each
        line is indented; otherwise it is printed on a single line with
        newlines replaced by spaces.
        """
        if metadata:
            if colorize:
                print(colors.metadata + metadata + colors.reset)
            else:
                print(metadata)

        if colorize:
            print(colors.abstract, end="")

        if columns > indent + 1:
            # Try to fill to columns
            prefix = " " * indent
            wrapped = textwrap.wrap(abstract, width=columns - indent - 1)
            for piece in wrapped:
                print("%s%s" % (prefix, piece))
            print("")
        else:
            flattened = abstract.replace("\n", " ")
            print("%s\n" % flattened)

        if colorize:
            print(colors.reset, end="")

    def print_entry(self):
        """Print this result and return its URL index.

        The URL "index" is a dict mapping a result index string to a URL:
        the key is str(self.index) for the result itself and that string
        followed by 'a', 'b', ... for each sitelink, in order.

        If the module-level `lucky` flag is set, the URL is opened in the
        browser and quit(conn) is called before anything is printed.
        """
        urls_by_index = {}

        # Open the URL in a web browser if option -j was specified.
        if lucky:
            self.open()
            quit(conn)

        number = self.index
        urls_by_index[str(number)] = self.url
        self.print_title_and_url(number, self.title, self.url)
        self.print_metadata_and_abstract(self.abstract, metadata=self.metadata)

        letter = 'a'
        for link in self.sitelinks:
            key = str(number) + letter
            self.print_title_and_url(key, link.title, link.url, indent=4)
            self.print_metadata_and_abstract(link.abstract, indent=4)
            urls_by_index[key] = link.url
            # Advance to the next sub-index letter
            letter = chr(ord(letter) + 1)

        return urls_by_index

    def open(self):
        """Open this result's URL in the browser."""
        open_url(self.url)

    def json_object(self):
        """Return a dict suitable for JSON serialization.

        Always contains 'title', 'url' and 'abstract'; 'metadata' and
        'sitelinks' are added only when they are non-empty.
        """
        serializable = {}
        serializable['title'] = self.title
        serializable['url'] = self.url
        serializable['abstract'] = self.abstract
        if self.metadata:
            serializable['metadata'] = self.metadata
        if self.sitelinks:
            serializable['sitelinks'] = [link.__dict__ for link in self.sitelinks]
        return serializable


# Escape sequences used when printing: one per output element, plus reset.
Colors = collections.namedtuple(
    'Colors',
    ['index', 'title', 'url', 'metadata', 'abstract', 'prompt', 'reset']
)


# Functions

def server_url(tld):
    """Return the Google host name (with www subdomain) for a TLD.

    tld is matched case-insensitively and may carry surrounding whitespace
    or a leading dot, so 'in', 'IN' and '.in' all resolve to
    'www.google.co.in'.  Unknown or empty values (including None) fall
    back to 'www.google.com'.

    Data source: https://en.wikipedia.org/wiki/List_of_Google_domains
    Scraper script: https://gist.github.com/zmwangx/b976e83c14552fe18b71
    """

    tld_to_domain_map = {
        'ac': 'google.ac',      'ad': 'google.ad',      'ae': 'google.ae',
        'af': 'google.com.af',  'ag': 'google.com.ag',  'ai': 'google.com.ai',
        'al': 'google.al',      'am': 'google.am',      'ao': 'google.co.ao',
        'ar': 'google.com.ar',  'as': 'google.as',      'at': 'google.at',
        'au': 'google.com.au',  'az': 'google.az',      'ba': 'google.ba',
        'bd': 'google.com.bd',  'be': 'google.be',      'bf': 'google.bf',
        'bg': 'google.bg',      'bh': 'google.com.bh',  'bi': 'google.bi',
        'bj': 'google.bj',      'bn': 'google.com.bn',  'bo': 'google.com.bo',
        'br': 'google.com.br',  'bs': 'google.bs',      'bt': 'google.bt',
        'bw': 'google.co.bw',   'by': 'google.by',      'bz': 'google.com.bz',
        'ca': 'google.ca',      'cat': 'google.cat',    'cc': 'google.cc',
        'cd': 'google.cd',      'cf': 'google.cf',      'cg': 'google.cg',
        'ch': 'google.ch',      'ci': 'google.ci',      'ck': 'google.co.ck',
        'cl': 'google.cl',      'cm': 'google.cm',      'cn': 'google.cn',
        'co': 'google.com.co',  'cr': 'google.co.cr',   'cu': 'google.com.cu',
        'cv': 'google.cv',      'cy': 'google.com.cy',  'cz': 'google.cz',
        'de': 'google.de',      'dj': 'google.dj',      'dk': 'google.dk',
        'dm': 'google.dm',      'do': 'google.com.do',  'dz': 'google.dz',
        'ec': 'google.com.ec',  'ee': 'google.ee',      'eg': 'google.com.eg',
        'es': 'google.es',      'et': 'google.com.et',  'fi': 'google.fi',
        'fj': 'google.com.fj',  'fm': 'google.fm',      'fr': 'google.fr',
        'ga': 'google.ga',      'ge': 'google.ge',      'gf': 'google.gf',
        'gg': 'google.gg',      'gh': 'google.com.gh',  'gi': 'google.com.gi',
        'gl': 'google.gl',      'gm': 'google.gm',      'gp': 'google.gp',
        'gr': 'google.gr',      'gt': 'google.com.gt',  'gy': 'google.gy',
        'hk': 'google.com.hk',  'hn': 'google.hn',      'hr': 'google.hr',
        'ht': 'google.ht',      'hu': 'google.hu',      'id': 'google.co.id',
        'ie': 'google.ie',      'il': 'google.co.il',   'im': 'google.im',
        'in': 'google.co.in',   'io': 'google.io',      'iq': 'google.iq',
        'is': 'google.is',      'it': 'google.it',      'je': 'google.je',
        'jm': 'google.com.jm',  'jo': 'google.jo',      'jp': 'google.co.jp',
        'ke': 'google.co.ke',   'kg': 'google.kg',      'kh': 'google.com.kh',
        'ki': 'google.ki',      'kr': 'google.co.kr',   'kw': 'google.com.kw',
        'kz': 'google.kz',      'la': 'google.la',      'lb': 'google.com.lb',
        'lc': 'google.com.lc',  'li': 'google.li',      'lk': 'google.lk',
        'ls': 'google.co.ls',   'lt': 'google.lt',      'lu': 'google.lu',
        'lv': 'google.lv',      'ly': 'google.com.ly',  'ma': 'google.co.ma',
        'md': 'google.md',      'me': 'google.me',      'mg': 'google.mg',
        'mk': 'google.mk',      'ml': 'google.ml',      'mm': 'google.com.mm',
        'mn': 'google.mn',      'ms': 'google.ms',      'mt': 'google.com.mt',
        'mu': 'google.mu',      'mv': 'google.mv',      'mw': 'google.mw',
        'mx': 'google.com.mx',  'my': 'google.com.my',  'mz': 'google.co.mz',
        'na': 'google.com.na',  'ne': 'google.ne',      'nf': 'google.com.nf',
        'ng': 'google.com.ng',  'ni': 'google.com.ni',  'nl': 'google.nl',
        'no': 'google.no',      'np': 'google.com.np',  'nr': 'google.nr',
        'nu': 'google.nu',      'nz': 'google.co.nz',   'om': 'google.com.om',
        'pa': 'google.com.pa',  'pe': 'google.com.pe',  'pg': 'google.com.pg',
        'ph': 'google.com.ph',  'pk': 'google.com.pk',  'pl': 'google.pl',
        'pn': 'google.co.pn',   'pr': 'google.com.pr',  'ps': 'google.ps',
        'pt': 'google.pt',      'py': 'google.com.py',  'qa': 'google.com.qa',
        'ro': 'google.ro',      'rs': 'google.rs',      'ru': 'google.ru',
        'rw': 'google.rw',      'sa': 'google.com.sa',  'sb': 'google.com.sb',
        'sc': 'google.sc',      'se': 'google.se',      'sg': 'google.com.sg',
        'sh': 'google.sh',      'si': 'google.si',      'sk': 'google.sk',
        'sl': 'google.com.sl',  'sm': 'google.sm',      'sn': 'google.sn',
        'so': 'google.so',      'sr': 'google.sr',      'st': 'google.st',
        'sv': 'google.com.sv',  'td': 'google.td',      'tg': 'google.tg',
        'th': 'google.co.th',   'tj': 'google.com.tj',  'tk': 'google.tk',
        'tl': 'google.tl',      'tm': 'google.tm',      'tn': 'google.tn',
        'to': 'google.to',      'tr': 'google.com.tr',  'tt': 'google.tt',
        'tw': 'google.com.tw',  'tz': 'google.co.tz',   'ua': 'google.com.ua',
        'ug': 'google.co.ug',   'uk': 'google.co.uk',   'uy': 'google.com.uy',
        'uz': 'google.co.uz',   'vc': 'google.com.vc',  've': 'google.co.ve',
        'vg': 'google.vg',      'vi': 'google.co.vi',   'vn': 'google.com.vn',
        'vu': 'google.vu',      'ws': 'google.ws',      'za': 'google.co.za',
        'zm': 'google.co.zm',   'zw': 'google.co.zw',
    }

    # Domain names are case-insensitive and users often type the leading
    # dot (the option help itself writes ".TLD"), so normalise first.
    try:
        key = tld.strip().lstrip('.').lower()
    except AttributeError:
        # Not a string (e.g. None): look it up unchanged, as before.
        key = tld

    try:
        # Use www subdomain
        return 'www.' + tld_to_domain_map[key]
    except KeyError:
        return 'www.google.com'


def google_get(conn, url):
    """Issue a GET for url on conn and return the response object.

    The request asks for gzip-compressed content and identifies itself
    with the global user agent string ``ua``.  url can be relative (to
    the appropriate Google domain).
    """

    headers = {
        "Accept-encoding": "gzip",
        "User-Agent": ua,
    }
    conn.request("GET", url, None, headers)
    return conn.getresponse()


def new_connection(domain=None):
    """Create an HTTPS connection object with a 45 second timeout.

    The connection targets domain when one is given (and non-empty);
    otherwise it targets the host held in the global variable server.
    """

    host = domain or server
    return HTTPSConnection(host, timeout=45)


def quit(conn):
    """Close conn, then terminate the program with exit status 1."""

    conn.close()
    # Equivalent to sys.exit(1), which simply raises SystemExit.
    raise SystemExit(1)


def fetch_results():
    """Fetch the current search URL, parse the response and print results.

    Reads and may rebind the globals ``conn`` and ``url``:

    - a request that raises is retried once on a fresh connection;
    - a single 301/302 redirect is followed by connecting to the host
      named in the Location header and requesting its /search path.

    The program is terminated through quit() when the redirect points to
    Google's "sorry" page, when the redirect target cannot be understood,
    when the redirected request fails, or on any other non-200 status.

    Results are printed entry by entry (regular output) or dumped as a
    JSON array, depending on the global ``output_format``.

    Returns a tuple (results, urlindex): the list of parsed results and a
    dict mapping result indices to URLs (empty for JSON output).
    """

    global conn
    global url

    try:
        resp = google_get(conn, url)
    except Exception as e:
        # Retry once on a new connection before giving up.
        debugp("Exception: %s" % e)
        conn.close()
        conn = new_connection()
        resp = google_get(conn, url)

    if resp.status != 200:
        if resp.status in (301, 302):
            url = resp.getheader('location', '')
            debugp("Redirected URL [%s]" % url)
            if url.find("sorry/IndexRedirect?") >= 0:
                error("Connection blocked due to unusual activity.")
                quit(conn)

            mark = url.find("/search")
            if mark < 0:
                # Without a /search path the slices below would use -1 and
                # yield a garbage host and request; refuse instead.
                error("Unexpected redirection to [%s]." % url)
                quit(conn)

            conn.close()
            # Host is whatever sits between "//" and the /search path.
            next_server = url[(url.find("//") + 2):mark]
            debugp("Next Server [%s]" % next_server)
            conn = new_connection(next_server)
            url = url[mark:]
            debugp("Next GET [%s]" % url)

            try:
                resp = google_get(conn, url)
            except Exception as e:
                debugp("Exception: %s" % e)
                quit(conn)

            if resp.status != 200:
                # Failed connecting to redirected server too!
                error("First redirection failed with HTTP %d: %s" %
                      (resp.status, resp.reason))
                quit(conn)
        else:
            # The server responded with an error.
            error("HTTP %d: %s" % (resp.status, resp.reason))
            quit(conn)

    # gzip was requested but is not guaranteed; only decompress payloads
    # that actually start with the gzip magic number (RFC 1952).
    payload = resp.read()
    if payload[:2] == b'\x1f\x8b':
        payload = gzip.GzipFile(fileobj=io.BytesIO(payload)).read()
    resp_body = payload.decode('utf-8')

    if debug:
        # Keep a copy of the response for inspection.
        import tempfile
        fd, tmpfile = tempfile.mkstemp(prefix='googler-response-')
        with os.fdopen(fd, 'wb') as fp:
            fp.write(resp_body.encode('utf-8'))
        debugp("Response body written to '%s'.\n" % tmpfile)

    # Parse the HTML document and print the results.
    parser = GoogleParser()
    parser.feed(resp_body)

    results = parser.results
    urlindex = {}
    if results:      # print a newline for more clarity
        printerr("")

    if output_format == 0:          # Regular output
        for r in results:
            urlindex.update(r.print_entry())
    elif output_format == 1:        # Json output
        import json
        results_object = [r.json_object() for r in results]
        print(json.dumps(results_object, indent=2, sort_keys=True, ensure_ascii=False))

    return results, urlindex


def show_omniprompt():
    """Display the omniprompt and return the line the user typed.

    With the global colorize flag on, the prompt text is wrapped in the
    prompt and reset colors; otherwise it is followed by a plain colon.
    """

    label = "googler (? for help)"
    if colorize:
        prompt = colors.prompt + label + colors.reset + ' '
    else:
        prompt = "%s: " % label
    return raw_input(prompt)


def construct_search_url(baseurl, keywords, site=None):
    """Return baseurl extended with the q query for the given keywords.

    baseurl should already contain queries such as start and num and end
    with "&"; only the q query is appended here.

    keywords is either a list of keywords (each is URL-quoted and the
    pieces are joined with "+") or a single space-delimited string of
    keywords (quoted as a whole).  When site is given, a "site:"
    restriction is appended to the query.
    """
    if type(keywords) is list:
        query = "+".join(url_quote_plus(word) for word in keywords)
    else:
        query = url_quote_plus(keywords)

    if site:
        query += "+site:" + url_quote_plus(site)

    url = baseurl + "q=" + query
    debugp("Search URL [%s : %s]" % (server, url))
    return url


def open_url(url):
    """Open url in the web browser with stdout and stderr silenced.

    File descriptors 1 and 2 are pointed at os.devnull for the duration
    of the webbrowser.open() call, so that output from the launched
    browser does not clutter the terminal, and are restored afterwards
    even if opening fails.
    """
    # Duplicate both descriptors *before* touching either of them.
    # Closing fd 2 first would make os.dup(1) return 2, so the saved
    # stdout copy would be overwritten by the redirection and stdout
    # would end up aliasing stderr after the restore.
    saved_stdout = os.dup(1)
    saved_stderr = os.dup(2)
    try:
        devnull = os.open(os.devnull, os.O_RDWR)
        try:
            os.dup2(devnull, 1)
            os.dup2(devnull, 2)
            webbrowser.open(url)
        finally:
            os.close(devnull)
    finally:
        os.dup2(saved_stdout, 1)
        os.dup2(saved_stderr, 2)
        # Release the temporary copies so repeated calls do not leak
        # descriptors.
        os.close(saved_stdout)
        os.close(saved_stderr)


# Messaging wrappers

def message(msg, level="INFO", ansi=None):
    """Write msg to stderr, prefixed with "[level] ".

    When ansi is not None and the global variable colorize is True, the
    line starts with ansi and ends with the ANSI reset sequence
    (ESC[0m); otherwise it is printed without any escape sequences.
    """

    opening = closing = ""
    if ansi is not None and colorize:
        opening, closing = ansi, "\x1b[0m"

    line = "{0}[{1}] {2}{3}".format(opening, level, msg, closing)
    print(line, file=sys.stderr)


def error(msg):
    """Report msg at level ERROR through message().

    The line is printed in red when colorized output is enabled.
    """

    message(msg, level="ERROR", ansi="\x1b[31m")


def warning(msg):
    """Report msg at level WARNING through message().

    The line is printed in yellow when colorized output is enabled.
    """

    message(msg, level="WARNING", ansi="\x1b[33m")


def debugp(msg):
    """Print msg at level DEBUG, but only when the global debug flag is set.

    debugp stands for "debug print".
    """

    if not debug:
        return

    # Debugging messages not colorized
    message(msg, "DEBUG")


def printerr(msg):
    """Write msg followed by a newline to stderr, without any prefix."""

    stream = sys.stderr
    print(msg, file=stream)


# Program Main

# Process command-line options.
# Placeholder; replaced by the parsed KEYWORD arguments once the command
# line has been processed further down.
keywords = None

class ExtendedArgumentParser(argparse.ArgumentParser):
    """Custom argument parser for googler,
    extends classic ArgumentParser

    The help output is augmented with the omniprompt key reference and
    general program information, and the full help is printed whenever
    argument parsing fails.
    """

    # Print omniprompt help
    @staticmethod
    def print_omniprompt_help(file=None):
        """Write the omniprompt key reference to file (default: stdout)."""
        if file is None:
            # Mirror argparse's own print_help(), which treats None as
            # stdout; without this the default argument crashed.
            file = sys.stdout
        file.write(textwrap.dedent("""
        omniprompt keys:
          n, p                  fetch the next or previous set of search results
          index                 open the result corresponding to index in browser
          f                     jump to the first page
          o                     open the current search in browser
          g keywords            initiate a new Google search for 'keywords' with original options
          q, ^D, double Enter   exit googler
          ?                     show omniprompt help
          *                     any other string initiates a new search with original options
        """))

    # Print information on googler
    @staticmethod
    def print_general_info(file=None):
        """Write version, copyright and license info to file (default: stdout)."""
        if file is None:
            file = sys.stdout
        file.write(textwrap.dedent("""
        Version %s
        Copyright (C) 2008 Henri Hakkinen
        Copyright (C) 2015-2016 Arun Prakash Jana <engineerarun@gmail.com>
        Zhiming Wang <zmwangx@gmail.com>
        License: GPLv3
        Webpage: https://github.com/jarun/googler
        """ % _VERSION_))

    # Augment print_help to print more than synopsis and options
    def print_help(self, file=None):
        """Print the standard help followed by omniprompt help and general info."""
        super(ExtendedArgumentParser, self).print_help(file)
        self.print_omniprompt_help(file)
        self.print_general_info(file)

    # Automatically print full help text on error
    def error(self, message):
        """Print the error message and the full help to stderr, then exit with status 2."""
        sys.stderr.write('%s: error: %s\n\n' % (self.prog, message))
        self.print_help(sys.stderr)
        self.exit(2)

def is_duration(arg):
    """Check if a string is a valid duration accepted by Google.

    A valid duration is of the form dNUM, where d is a single letter h
    (hour), d (day), w (week), m (month), or y (year), and NUM is a
    non-negative integer written with plain decimal digits only.

    Returns arg unchanged when valid; raises argparse.ArgumentTypeError
    otherwise, which makes this usable as an argparse ``type`` callable.
    """
    try:
        if arg[0] not in ('h', 'd', 'w', 'm', 'y'):
            raise ValueError
        number = arg[1:]
        # int() on its own is too lenient: it also accepts a sign,
        # surrounding whitespace and digit-group underscores, all of which
        # would end up verbatim in the query URL.
        if int(number) < 0 or not all(ch in '0123456789' for ch in number):
            raise ValueError
    except (TypeError, IndexError, ValueError):
        raise argparse.ArgumentTypeError('%s is not a valid duration' % arg)
    return arg

def is_colorstr(arg):
    """Check if a string is a valid color string.

    A valid color string has exactly six characters, each of which is a
    key of COLORMAP.  Returns arg unchanged when valid; raises
    argparse.ArgumentTypeError otherwise.
    """
    # Explicit checks instead of assert statements: asserts are removed
    # when Python runs with -O, which would disable the validation.
    if len(arg) != 6 or any(c not in COLORMAP for c in arg):
        raise argparse.ArgumentTypeError('%s is not a valid color string' % arg)
    return arg

# The GOOGLER_COLORS environment variable, when set and non-empty,
# supplies the default for --colors.
colorstr_env = os.environ.get('GOOGLER_COLORS')

# Command-line interface definition. add_help is off, so argparse does not
# register its own -h/--help option.
argparser = ExtendedArgumentParser(
    description='Google from the command-line.',
    add_help=False
)
addarg = argparser.add_argument

# Result window
addarg('-s', '--start', metavar='N', type=int, dest='start',
       help='start at the Nth result')
addarg('-n', '--count', metavar='N', type=int, dest='num',
       help='show N results (default 10)')

# Search scope and language
addarg('-N', '--news', action='store_true', dest='news',
       help='show results from news section')
addarg('-c', '--tld', metavar='TLD', dest='tld',
       help="country-specific search with top-level domain .TLD, e.g., 'in'\n"
            "       for India. Ref: https://en.wikipedia.org/wiki/List_of_Google_domains")
addarg('-l', '--lang', metavar='LANG', dest='lang',
       help='display in language LANG')
addarg('-x', '--exact', action='store_true', dest='exact',
       help='disable automatic spelling correction')

# Presentation
addarg('-C', '--nocolor', action='store_false', dest='colorize',
       help='disable color output')
addarg('--colors', metavar='COLORS', type=is_colorstr, dest='colorstr',
       default=colorstr_env or 'GKlxxy',
       help='set output colors (see man page for details)')

# Behaviour
addarg('-j', '--first', '--lucky', action='store_true', dest='lucky',
       help='open the first result in a web browser')
addarg('-t', '--time', metavar='dN', type=is_duration, dest='duration',
       help='time limit search [h5 (5 hrs), d5 (5 days), '
            'w5 (5 weeks), m5 (5 months), y5 (5 years)]')
addarg('-w', '--site', metavar='SITE', dest='insite',
       help='search a site using Google')
addarg('--json', action='store_true', dest='json',
       help='output in JSON format; implies --noprompt')
addarg('--np', '--noprompt', action='store_true', dest='noninteractive',
       help='perform search and exit, do not prompt for further interactions')
addarg('-d', '--debug', action='store_true', dest='debug',
       help='enable debugging')

# Positional search terms (at least one is required)
addarg('keywords', metavar='KEYWORD', nargs='+',
       help='search keywords')

# Invoked without any argument: show the full help and bail out.
if not sys.argv[1:]:
    argparser.print_help(sys.stderr)
    sys.exit(1)

# Set global variables
args = argparser.parse_args()

# Only positive values override the existing start/num settings.
if args.start and args.start > 0:
    start = args.start
if args.num and args.num > 0:
    num = args.num

# The server is only switched when a TLD was requested.
if args.tld:
    server = server_url(args.tld)

news = args.news
lang = args.lang
exact = args.exact
lucky = args.lucky
duration = args.duration
insite = args.insite
keywords = args.keywords

# One COLORMAP entry per character of the color string, plus the reset.
colorize = args.colorize
colors = Colors(*(COLORMAP[c] for c in args.colorstr), reset=COLORMAP['x'])

# tempfile is only needed for dumping responses in debug mode.
if args.debug:
    debug = args.debug
    import tempfile

# JSON output implies non-interactive mode; json is imported on demand.
noninteractive = args.noninteractive
if args.json:
    output_format = 1
    noninteractive = True
    import json

# Get terminal width
columns, _ = get_terminal_size()

debugp("Version %s" % _VERSION_)

# Construct the query URL: collect the options that apply, then join them.
query_options = ["start=" + str(start)]
if num is not None:
    query_options.append("num=" + str(num))
if news:
    query_options.append("tbm=nws")
if lang is not None:
    query_options.append("hl=" + lang)
if duration is not None:
    query_options.append("tbs=qdr:" + duration)
if exact:
    query_options.append("nfpr=1")

# Every option, including the last one, is terminated by "&" so that the
# q query can be appended directly.
url = "/search?ie=UTF-8&oe=UTF-8&" + "&".join(query_options) + "&"

# Remember the keyword-less URL and the initial offset for new searches.
baseurl, basestart = url, start

debugp("Base URL [%s]" % url)

url = construct_search_url(baseurl, keywords, site=insite)

# Connect to Google and request the result page.
conn = new_connection()

# Main loop: fetch results when the last command asks for it, then read the
# next omniprompt command and dispatch on it. The loop ends on "q", EOF,
# two consecutive empty inputs, or right after the first fetch in
# non-interactive mode.
# NOTE(review): nav is initialised outside this part of the file; presumably
# to a value that triggers the first fetch -- confirm.
results = []
urlindex = {}
while True:
    # Only navigation ("n", "p", "f") and new searches ("g") fetch; other
    # commands leave the current results and URL index in place.
    if nav == "n" or nav == "p" or nav == "f" or nav == "g":
        results, urlindex = fetch_results()

    if noninteractive:
        break

    # Remember the offset currently embedded in url so that it can be
    # substituted when paging.
    oldstart = start
    try:
        nav = show_omniprompt()
    except EOFError:
        break

    if not nav:
        # Empty input: prompt once more before giving up.
        try:
            nav = show_omniprompt()
        except EOFError:
            break
        if not nav:
            # Two consecutive enters
            break

    if nav == "n":
        # Nothing was returned for the current page, so there is no next
        # page to move to.
        if len(results) == 0:
            nav = ""
            continue
        # Advance by the requested page size, or by 10 when none was given.
        if num is not None:
            start = start + num
        else:
            start = start + 10

        # Swap the start parameter in place; only the first occurrence is
        # replaced.
        url = url.replace("start=" + str(oldstart) + "&", "start=" + str(start) + "&", 1)
        debugp("Next URL [%s]\n" % url)
    elif nav == "p" or nav == "f":
        if start == 0:
            printerr("Already at the first page.")
            nav = "" # Unset nav so that we don't fetch result in the next iteration
            continue

        if nav == "p":
            # Step back by one page, clamping at the first result.
            if num is not None:
                newstart = start - num
            else:
                newstart = start - 10

            if newstart >= 0:
                start = newstart
            else:
                start = 0
        else:
            # "f": jump straight to the first page.
            start = 0

        url = url.replace("start=" + str(oldstart) + "&", "start=" + str(start) + "&", 1)
        debugp("Next URL [%s]\n" % url)
    elif nav == "o":
        # Open the current search itself in the browser.
        open_url("https://" + server + url)
    elif nav == "q":
        break
    elif nav == "?":
        ExtendedArgumentParser.print_omniprompt_help(sys.stderr)
        printerr("")
    elif nav.startswith("g "):
        # Explicit new search: everything after "g " is the keyword string.
        keywords = nav[2:].strip()
        if not keywords:
            nav = ""
            continue
        url = construct_search_url(baseurl, keywords, site=insite)
        # Normalise nav so that the next iteration fetches the new search.
        nav = "g"
        start = basestart
    elif nav in urlindex:
        # A result index such as "1" or "1a": open the matching URL.
        open_url(urlindex[nav])
    elif nav.isdigit() and int(nav) < 100:
        # Small numbers are taken as mistyped indices, not as searches.
        printerr("Index out of bound. To search for the number, use g.")
    elif nav:
        # Any other input starts a new search with the original options.
        keywords = nav.strip()
        if not keywords:
            nav = ""
            continue
        url = construct_search_url(baseurl, keywords, site=insite)
        nav = "g"
        start = basestart


conn.close()
