#include <iostream>
#include "wildcard.h"
#include "config.h"
#include "cpluslib/string.h"

//
//  [[Wildcard]] - Shell Wildcard Matcher
//
//  This class provides a means of matching shell wildcards.
//  A pattern is entered into an object of type /Wildcard/, and
//  can then be matched against any number of strings:
//  X{  Wildcard w(my_wild);
//      if(w.match(my_text))
//          std::cout << "my_wild matches my_text\n";}
//
// Construct a matcher for the wildcard pattern /re/.
// The pattern is compiled using /setExpr/. The result of /setExpr/ is
// not reported to the caller; use the default constructor followed by
// an explicit /setExpr/ call if the pattern needs to be validated.
Wildcard::Wildcard(string_t re)
{
    this->setExpr(re);
}

// Construct a matcher without compiling any pattern. No opcodes are
// added here; use /setExpr/ or /setLiteral/ to give the object
// something to match.
Wildcard::Wildcard()
{
}

// Destructor. Performs no explicit cleanup of its own.
Wildcard::~Wildcard()
{
}

// Discard the current pattern and make this object match exactly the
// string /s/. No character of /s/ is treated as a wildcard; the whole
// string is stored as one literal opcode.
void
Wildcard::setLiteral(string_t s)
{
    // Replace the whole program by a single MATCH_LIT instruction.
    code.assign(1, std::make_pair(MATCH_LIT, s));
}

// Compile the wildcard /s/ into this object, replacing whatever
// pattern was set before. Supported constructions:
//
// * `?' matches a single character
// * `*' matches zero or more single characters
// * `[abc]' matches a, b or c
// * `[^abc]' matches all but a, b or c
// * `[a-x]' matches any letter between a and x
// * a backslash removes the special meaning from the character that
//   follows it ("?*[\" in normal mode, "\-]" in a set).
//
// Notes on sets:
// * a `-' that is the first member of a set is taken literally
// * the character after a range `-' is used as the end of the range
//   as it stands
// * a range whose end sorts before its start contributes nothing and
//   also removes the start character from the set
//
// Returns true on success. Returns false if the pattern is malformed
// (unterminated set, backslash at the very end); in that case the
// object is left with a partially-compiled expression.
bool
Wildcard::setExpr(string_t s)
{
    const string_t::size_type len = s.length();
    string_t::size_type pos = 0;

    code.clear();
    while(pos < len) {
        if(s[pos] == '?') {
            code.push_back(std::make_pair(MATCH_SINGLE, string_t()));
            ++pos;
        } else if(s[pos] == '*') {
            code.push_back(std::make_pair(MATCH_SEQ, string_t()));
            ++pos;
        } else if(s[pos] == '[') {
            // Character set. Needs at least one more character.
            ++pos;
            if(pos >= len)
                return false;

            // A leading `^' inverts the set.
            Opcode op = MATCH_RANGE;
            if(s[pos] == '^') {
                op = MATCH_NOT;
                ++pos;
            }

            // Collect the members. The last character of the pattern
            // is never consumed here, it has to be the closing `]'.
            string_t members;
            while(pos < len-1 && s[pos] != ']') {
                if(s[pos] == '-' && !members.empty()) {
                    // Range from the previously added member up to
                    // the character following the `-'.
                    const unsigned char last = s[pos+1];
                    unsigned char cur = members[members.length()-1];
                    if(last < cur) {
                        members.erase(members.length() - 1);
                    } else {
                        while(cur != last)
                            members += ++cur;
                    }
                    pos += 2;
                } else {
                    // Single member, possibly quoted by a backslash.
                    if(s[pos] == '\\')
                        ++pos;
                    members += s[pos];
                    ++pos;
                }
            }

            // The opcode is stored even when the set turns out to be
            // unterminated (partially-compiled expression).
            code.push_back(std::make_pair(op, members));
            if(pos >= len || s[pos] != ']')
                return false;
            ++pos;
        } else {
            // Literal character. A backslash quotes the next one and
            // must not be the last character of the pattern.
            if(s[pos] == '\\') {
                ++pos;
                if(pos >= len)
                    return false;
            }

            // Adjacent literal characters share one MATCH_LIT opcode.
            if(code.empty() || code.back().first != MATCH_LIT)
                code.push_back(std::make_pair(MATCH_LIT, string_t()));
            code.back().second += s[pos];
            ++pos;
        }
    }
    return true;
}

/*
 *  Match the string /s/ against the compiled opcodes starting at
 *  index /n/. Returns true if the opcodes from /n/ onwards match
 *  all of /s/, false otherwise.
 *
 *  This is a recursive backtracking matcher: MATCH_SINGLE,
 *  MATCH_RANGE, MATCH_NOT and MATCH_LIT consume a prefix of /s/ and
 *  recurse on the remainder; MATCH_SEQ tries the rest of the pattern
 *  at every split point of /s/, including the empty remainder.
 *
 *  Remaining possible optimisations:
 *  + optimize `*literal' by directly looking for literal instead of
 *    backtracking each character. Does this buy anything with a decent
 *    compiler?
 *  + optimize `*?' => `?*'
 *  + sort patterns for MATCH_RANGE/MATCH_NOT (O(n) -> O(ld n))
 *  + work on C strings. Since Unix (filenames) and ELF (section/symbol
 *    names) use C strings, there is no point in supporting embedded NULs
 */
bool
Wildcard::matchInternal(string_t s, code_type::size_type n)
{
    /* empty code matches empty string */
    if(n >= code.size())
        return s.empty();

    switch(code[n].first) {
     case MATCH_SEQ:
        /* `**' is equivalent to `*'. Skipping the whole run avoids
           backtracking once per star. */
        do
            ++n;
        while(n < code.size() && code[n].first == MATCH_SEQ);

        if(n == code.size())    // speed up `*' matching long string
            return true;

        /* Try the rest of the pattern at every split point. The empty
           remainder must be tried as well, otherwise a `*' followed
           by something that can match the empty string would be
           rejected at the end of the input. */
        for(;;) {
            if(matchInternal(s, n))
                return true;
            if(s.empty())
                return false;
            s.erase(0, 1);
        }

     case MATCH_SINGLE:
        if(s.empty())
            return false;
        return matchInternal(s.substr(1), n+1);

     case MATCH_RANGE:
        /* first character must be a member of the set */
        if(s.empty())
            return false;
        if(code[n].second.find(s[0]) == string_t::npos)
            return false;
        return matchInternal(s.substr(1), n+1);

     case MATCH_NOT:
        /* first character must exist and not be a member of the set */
        if(s.empty())
            return false;
        if(code[n].second.find(s[0]) != string_t::npos)
            return false;
        return matchInternal(s.substr(1), n+1);

     case MATCH_LIT:
        /* compare() fails if /s/ is shorter than the literal, so the
           substr() below never starts beyond the end of /s/ */
        if(s.compare(0, code[n].second.length(), code[n].second) != 0)
            return false;
        return matchInternal(s.substr(code[n].second.length()), n+1);
    }

    /* Unknown opcode: never matches. Also makes sure control does not
       flow off the end of this function. */
    return false;
}

// Test the string /s/ against the current wildcard. Returns true if
// the whole string matches, false otherwise.
bool
Wildcard::match(string_t s)
{
    // Start matching at the first compiled opcode.
    const code_type::size_type firstOpcode = 0;
    return matchInternal(s, firstOpcode);
}

// Return the longest prefix every match of this wildcard is known to
// start with. For the wildcard `foo*', this is the string "foo". If
// the wildcard does not start with literal text, the result is the
// empty string. Callers can use this to narrow down a search: only
// words starting with the prefix can possibly match.
string_t
Wildcard::getPrefix() const
{
    // Only a leading literal opcode yields a fixed prefix.
    if(!code.empty() && code[0].first == MATCH_LIT)
        return code[0].second;
    return string_t();
}

#ifdef DEBUG
void
Wildcard::dump(std::ostream& os)
{
    static const char* names[] = {
        "MATCH_SEQ",
        "MATCH_SINGLE",
        "MATCH_RANGE",
        "MATCH_NOT",
        "MATCH_LIT"
    };
    
    os << "Compiled pattern:\n";
    for(code_type::iterator i = code.begin(); i != code.end(); ++i) {
        os << "\t" << names[i->first] << " `" << i->second << "'\n";
    }
}
#endif
