Source code for omnipresence.plugins.wwwjdic

# -*- coding: utf-8 -*-
# -*- test-case-name: omnipresence.plugins.wwwjdic.test_wwwjdic -*-
"""Event plugins for searching WWWJDIC."""


import re
import urllib

from twisted.internet.defer import inlineCallbacks, returnValue
from twisted.web.client import readBody
try:
    from waapuro import romanize
except ImportError:
    romanize = None

from ...message import collapse
from ...plugin import EventPlugin, UserVisibleError
from ...web.html import parse as parse_html
from ...web.http import default_agent


#: A regex for identifying pronunciations in a JDIC entry, if present.
#: Group 1 is the text between the first pair of square brackets.
#:
#: Written as a plain raw string rather than a ``ur''`` literal: the
#: pattern is pure ASCII, so it matches Unicode text identically, and
#: the ``ur`` prefix is a syntax error on Python 3.
PRONUNCIATIONS_RE = re.compile(r'\[([^\]]+)\]')

#: A regex for identifying markings at the end of a kana pronunciation:
#: one or more consecutive parenthesized groups anchored at the end of
#: the string, e.g. ``(P)`` or ``(ok)``.
MARKINGS_RE = re.compile(r'(?:\([^)]+\))+$')


class Default(EventPlugin):
    u"""Define a Japanese word or phrase using `Jim Breen's WWWJDIC`__.

    __ http://wwwjdic.org/

    If `Waapuro`__ is installed, Nihon-shiki romanizations are provided
    alongside the kana spellings.

    __ https://pypi.python.org/pypi/waapuro

    :alice: wwwjdic kotoba
    :bot: 言葉(P);詞;辞 [ことば (kotoba) (P); けとば (ketoba) (言葉)(ok)] (n)
          (1) (See 言語) language; dialect;
          (2) (See 単語) word; words; phrase; term; expression; remark;
          (3) speech; (manner of) speaking; (P) (+28 more)
    """

    def __init__(self):
        # Kept as instance attributes so that the HTTP agent and the
        # romanization function can be swapped out per instance.
        # ``romanize`` is None when Waapuro could not be imported.
        self.agent = default_agent
        self.romanize = romanize

    @inlineCallbacks
    def on_command(self, msg):
        """Look up ``msg.content`` in WWWJDIC and return the entries.

        The query is URL-quoted and appended to the WWWJDIC request
        URL.  Every non-blank line of text inside the response's
        ``<pre>`` element is treated as one dictionary entry.

        :returns: (via ``returnValue``) a list of formatted entry
            strings; empty if the response has no ``<pre>`` element
            or that element has no text.
        :raises UserVisibleError: if ``msg.content`` is empty.
        """
        if not msg.content:
            raise UserVisibleError('Please specify a search query.')
        q = urllib.quote_plus(msg.content)
        response = yield self.agent.request('GET',
            'http://www.edrdg.org/cgi-bin/wwwjdic/wwwjdic?1ZUJ{}'.format(q))
        content = yield readBody(response)
        soup = parse_html(content)
        results = []
        # NOTE(review): assuming parse_html returns a BeautifulSoup-like
        # tree, ``.string`` is None when <pre> does not hold exactly one
        # text node; treat that like a missing <pre> instead of letting
        # ``None.strip()`` raise AttributeError.
        raw = soup.pre.string if soup.pre else None
        if not raw:
            returnValue(results)
        for line in raw.strip().splitlines():
            # Strip every line up front so that the offsets computed
            # during romanization and the trailing-slash check below
            # are not thrown off by surrounding whitespace.
            entry = line.strip()
            if not entry:
                continue
            if self.romanize:
                entry = self._add_romanizations(entry)
            results.append(self._format_glosses(entry))
        returnValue(results)

    def _add_romanizations(self, entry):
        """Return *entry* with a romanization after each pronunciation.

        *entry* must be non-empty with no leading whitespace.  The
        pronunciations are the semicolon-separated items inside the
        first pair of square brackets or, when the entry has no
        brackets, the first whitespace-delimited token.
        """
        match = PRONUNCIATIONS_RE.search(entry)
        if match is None:
            # No bracketed readings, so the headword itself is what
            # gets romanized.
            start, end = 0, len(entry.split(None, 1)[0])
        else:
            start, end = match.span(1)
        romanized = []
        for reading in entry[start:end].split(u';'):
            # Trailing markings such as "(P)" are cut off before
            # romanizing and re-attached after the romanization.
            markings = MARKINGS_RE.search(reading)
            if markings is None:
                suffix = u''
            else:
                suffix = u' ' + markings.group(0)
                reading = reading[:markings.start()]
            romanized.append(
                reading + u' (' + self.romanize(reading) + u')' + suffix)
        return entry[:start] + u'; '.join(romanized) + entry[end:]

    @staticmethod
    def _format_glosses(entry):
        """Rewrite the slash-delimited glosses of *entry* for display."""
        # Strip off the trailing slash for the last gloss, then
        # replace the first slash with nothing and the remaining
        # ones with semicolons, in an approximation of the Web
        # interface.  The trailing character is only dropped when it
        # really is a slash, so a line that lacks one keeps its final
        # character.
        if entry.endswith(u'/'):
            entry = entry[:-1].strip()
        entry = entry.replace(u'/', u'', 1)
        return entry.replace(u'/', u'; ')

    def on_cmdhelp(self, msg):
        """Return the help text for this command, passed through
        ``collapse``."""
        return collapse("""\
            \x1Fquery\x1F - Look up a Japanese word or phrase in Jim
            Breen's WWWJDIC <http://wwwjdic.org/>.
            """)