wildcard.c

   1 /*
   2  * Wildcard matching engine for use with SFTP-based file transfer
   3  * programs (PSFTP, new-look PSCP): since SFTP has no notion of
   4  * getting the remote side to do globbing (and rightly so) we have
   5  * to do it locally, by retrieving all the filenames in a directory
   6  * and checking each against the wildcard pattern.
   7  */
   8
   9 #include <assert.h>
  10 #include <stdlib.h>
  11 #include <string.h>
  12
  13 #include "putty.h"
  14
  15 /*
  16  * Definition of wildcard syntax:
  17  *
  18  *  - * matches any sequence of characters, including zero.
  19  *  - ? matches exactly one character which can be anything.
  20  *  - [abc] matches exactly one character which is a, b or c.
  21  *  - [a-f] matches anything from a through f.
  22  *  - [^a-f] matches anything _except_ a through f.
  23  *  - [-_] matches - or _; [^-_] matches anything else. (The - is
  24  *    non-special if it occurs immediately after the opening
  25  *    bracket or ^.)
  26  *  - [a^] matches an a or a ^. (The ^ is non-special if it does
  27  *    _not_ occur immediately after the opening bracket.)
  28  *  - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \.
  29  *  - All other characters are non-special and match themselves.
  30  */
  31
  32 /*
  33  * Some notes on differences from POSIX globs (IEEE Std 1003.1, 2003 ed.):
  34  *  - backslashes act as escapes even within [] bracket expressions
  35  *  - does not support [!...] for non-matching list (POSIX are weird);
  36  *    NB POSIX allows [^...] as well via "A bracket expression starting
  37  *    with an unquoted circumflex character produces unspecified
  38  *    results". If we wanted to allow [!...] we might want to define
  39  *    [^!] as having its literal meaning (match '^' or '!').
  40  *  - none of the scary [[:class:]] stuff, etc
  41  */
  42
  43 /*
  44  * The wildcard matching technique we use is very simple and
  45  * potentially O(N^2) in running time, but I don't anticipate it
  46  * being that bad in reality (particularly since N will be the size
  47  * of a filename, which isn't all that much). Perhaps one day, once
  48  * PuTTY has grown a regexp matcher for some other reason, I might
  49  * come back and reimplement wildcards by translating them into
  50  * regexps or directly into NFAs; but for the moment, in the
  51  * absence of any other need for the NFA->DFA translation engine,
  52  * anything more than the simplest possible wildcard matcher is
  53  * vast code-size overkill.
  54  *
  55  * Essentially, these wildcards are much simpler than regexps in
  56  * that they consist of a sequence of rigid fragments (? and [...]
  57  * can never match more or less than one character) separated by
  58  * asterisks. It is therefore extremely simple to look at a rigid
  59  * fragment and determine whether or not it begins at a particular
  60  * point in the test string; so we can search along the string
  61  * until we find each fragment, then search for the next. As long
  62  * as we find each fragment in the _first_ place it occurs, there
  63  * will never be a danger of having to backpedal and try to find it
  64  * again somewhere else.
  65  */
  66
  67 enum {
  68     WC_TRAILINGBACKSLASH = 1,
  69     WC_UNCLOSEDCLASS,
  70     WC_INVALIDRANGE
  71 };
  72
  73 /*
  74  * Error reporting is done by returning various negative values
  75  * from the wildcard routines. Passing any such value to wc_error
  76  * will give a human-readable message.
  77  */
  78 const char *wc_error(int value)
  79 {
  80     value = abs(value);
  81     switch (value) {
  82       case WC_TRAILINGBACKSLASH:
  83         return "'\' occurred at end of string (expected another character)";
  84       case WC_UNCLOSEDCLASS:
  85         return "expected ']' to close character class";
  86       case WC_INVALIDRANGE:
  87         return "character range was not terminated (']' just after '-')";
  88     }
  89     return "INTERNAL ERROR: unrecognised wildcard error number";
  90 }
  91
  92 /*
  93  * This is the routine that tests a target string to see if an
  94  * initial substring of it matches a fragment. If successful, it
  95  * returns 1, and advances both `fragment' and `target' past the
  96  * fragment and matching substring respectively. If unsuccessful it
  97  * returns zero. If the wildcard fragment suffers a syntax error,
  98  * it returns <0 and the precise value indexes into wc_error.
  99  */
 100 static int wc_match_fragment(const char **fragment, const char **target)
 101 {
 102     const char *f, *t;
 103
 104     f = *fragment;
 105     t = *target;
 106     /*
 107      * The fragment terminates at either the end of the string, or
 108      * the first (unescaped) *.
 109      */
 110     while (*f && *f != '*' && *t) {
 111         /*
 112          * Extract one character from t, and one character's worth
 113          * of pattern from f, and step along both. Return 0 if they
 114          * fail to match.
 115          */
 116         if (*f == '\\') {
 117             /*
 118              * Backslash, which means f[1] is to be treated as a
 119              * literal character no matter what it is. It may not
 120              * be the end of the string.
 121              */
 122             if (!f[1])
 123                 return -WC_TRAILINGBACKSLASH;   /* error */
 124             if (f[1] != *t)
 125                 return 0;              /* failed to match */
 126             f += 2;
 127         } else if (*f == '?') {
 128             /*
 129              * Question mark matches anything.
 130              */
 131             f++;
 132         } else if (*f == '[') {
 133             int invert = 0;
 134             int matched = 0;
 135             /*
 136              * Open bracket introduces a character class.
 137              */
 138             f++;
 139             if (*f == '^') {
 140                 invert = 1;
 141                 f++;
 142             }
 143             while (*f != ']') {
 144                 if (*f == '\\')
 145                     f++;               /* backslashes still work */
 146                 if (!*f)
 147                     return -WC_UNCLOSEDCLASS;   /* error again */
 148                 if (f[1] == '-') {
 149                     int lower, upper, ourchr;
 150                     lower = (unsigned char) *f++;
 151                     f++;               /* eat the minus */
 152                     if (*f == ']')
 153                         return -WC_INVALIDRANGE;   /* different error! */
 154                     if (*f == '\\')
 155                         f++;           /* backslashes _still_ work */
 156                     if (!*f)
 157                         return -WC_UNCLOSEDCLASS;   /* error again */
 158                     upper = (unsigned char) *f++;
 159                     ourchr = (unsigned char) *t;
 160                     if (lower > upper) {
 161                         int t = lower; lower = upper; upper = t;
 162                     }
 163                     if (ourchr >= lower && ourchr <= upper)
 164                         matched = 1;
 165                 } else {
 166                     matched |= (*t == *f++);
 167                 }
 168             }
 169             if (invert == matched)
 170                 return 0;              /* failed to match character class */
 171             f++;                       /* eat the ] */
 172         } else {
 173             /*
 174              * Non-special character matches itself.
 175              */
 176             if (*f != *t)
 177                 return 0;
 178             f++;
 179         }
 180         /*
 181          * Now we've done that, increment t past the character we
 182          * matched.
 183          */
 184         t++;
 185     }
 186     if (!*f || *f == '*') {
 187         /*
 188          * We have reached the end of f without finding a mismatch;
 189          * so we're done. Update the caller pointers and return 1.
 190          */
 191         *fragment = f;
 192         *target = t;
 193         return 1;
 194     }
 195     /*
 196      * Otherwise, we must have reached the end of t before we
 197      * reached the end of f; so we've failed. Return 0.
 198      */
 199     return 0;
 200 }
 201
 202 /*
 203  * This is the real wildcard matching routine. It returns 1 for a
 204  * successful match, 0 for an unsuccessful match, and <0 for a
 205  * syntax error in the wildcard.
 206  */
 207 int wc_match(const char *wildcard, const char *target)
 208 {
 209     int ret;
 210
 211     /*
 212      * Every time we see a '*' _followed_ by a fragment, we just
 213      * search along the string for a location at which the fragment
 214      * matches. The only special case is when we see a fragment
 215      * right at the start, in which case we just call the matching
 216      * routine once and give up if it fails.
 217      */
 218     if (*wildcard != '*') {
 219         ret = wc_match_fragment(&wildcard, &target);
 220         if (ret <= 0)
 221             return ret;                /* pass back failure or error alike */
 222     }
 223
 224     while (*wildcard) {
 225         assert(*wildcard == '*');
 226         while (*wildcard == '*')
 227             wildcard++;
 228
 229         /*
 230          * It's possible we've just hit the end of the wildcard
 231          * after seeing a *, in which case there's no need to
 232          * bother searching any more because we've won.
 233          */
 234         if (!*wildcard)
 235             return 1;
 236
 237         /*
 238          * Now `wildcard' points at the next fragment. So we
 239          * attempt to match it against `target', and if that fails
 240          * we increment `target' and try again, and so on. When we
 241          * find we're about to try matching against the empty
 242          * string, we give up and return 0.
 243          */
 244         ret = 0;
 245         while (*target) {
 246             const char *save_w = wildcard, *save_t = target;
 247
 248             ret = wc_match_fragment(&wildcard, &target);
 249
 250             if (ret < 0)
 251                 return ret;            /* syntax error */
 252
 253             if (ret > 0 && !*wildcard && *target) {
 254                 /*
 255                  * Final special case - literally.
 256                  *
 257                  * This situation arises when we are matching a
 258                  * _terminal_ fragment of the wildcard (that is,
 259                  * there is nothing after it, e.g. "*a"), and it
 260                  * has matched _too early_. For example, matching
 261                  * "*a" against "parka" will match the "a" fragment
 262                  * against the _first_ a, and then (if it weren't
 263                  * for this special case) matching would fail
 264                  * because we're at the end of the wildcard but not
 265                  * at the end of the target string.
 266                  *
 267                  * In this case what we must do is measure the
 268                  * length of the fragment in the target (which is
 269                  * why we saved `target'), jump straight to that
 270                  * distance from the end of the string using
 271                  * strlen, and match the same fragment again there
 272                  * (which is why we saved `wildcard'). Then we
 273                  * return whatever that operation returns.
 274                  */
 275                 target = save_t + strlen(save_t) - (target - save_t);
 276                 wildcard = save_w;
 277                 return wc_match_fragment(&wildcard, &target);
 278             }
 279
 280             if (ret > 0)
 281                 break;
 282             target++;
 283         }
 284         if (ret > 0)
 285             continue;
 286         return 0;
 287     }
 288
 289     /*
 290      * If we reach here, it must be because we successfully matched
 291      * a fragment and then found ourselves right at the end of the
 292      * wildcard. Hence, we return 1 if and only if we are also
 293      * right at the end of the target.
 294      */
 295     return (*target ? 0 : 1);
 296 }
 297
 298 /*
 299  * Another utility routine that translates a non-wildcard string
 300  * into its raw equivalent by removing any escaping backslashes.
 301  * Expects a target string buffer of anything up to the length of
 302  * the original wildcard. You can also pass NULL as the output
 303  * buffer if you're only interested in the return value.
 304  *
 305  * Returns 1 on success, or 0 if a wildcard character was
 306  * encountered. In the latter case the output string MAY not be
 307  * zero-terminated and you should not use it for anything!
 308  */
 309 int wc_unescape(char *output, const char *wildcard)
 310 {
 311     while (*wildcard) {
 312         if (*wildcard == '\\') {
 313             wildcard++;
 314             /* We are lenient about trailing backslashes in non-wildcards. */
 315             if (*wildcard) {
 316                 if (output)
 317                     *output++ = *wildcard;
 318                 wildcard++;
 319             }
 320         } else if (*wildcard == '*' || *wildcard == '?' ||
 321                    *wildcard == '[' || *wildcard == ']') {
 322             return 0;                  /* it's a wildcard! */
 323         } else {
 324             if (output)
 325                 *output++ = *wildcard;
 326             wildcard++;
 327         }
 328     }
 329     if (output)
 330         *output = '\0';
 331     return 1;                          /* it's clean */
 332 }
 333
 334 #ifdef TESTMODE
 335
 336 struct test {
 337     const char *wildcard;
 338     const char *target;
 339     int expected_result;
 340 };
 341
 342 const struct test fragment_tests[] = {
 343     /*
 344      * We exhaustively unit-test the fragment matching routine
 345      * itself, which should save us the need to test all its
 346      * intricacies during the full wildcard tests.
 347      */
 348     {"abc", "abc", 1},
 349     {"abc", "abd", 0},
 350     {"abc", "abcd", 1},
 351     {"abcd", "abc", 0},
 352     {"ab[cd]", "abc", 1},
 353     {"ab[cd]", "abd", 1},
 354     {"ab[cd]", "abe", 0},
 355     {"ab[^cd]", "abc", 0},
 356     {"ab[^cd]", "abd", 0},
 357     {"ab[^cd]", "abe", 1},
 358     {"ab\\", "abc", -WC_TRAILINGBACKSLASH},
 359     {"ab\\*", "ab*", 1},
 360     {"ab\\?", "ab*", 0},
 361     {"ab?", "abc", 1},
 362     {"ab?", "ab", 0},
 363     {"ab[", "abc", -WC_UNCLOSEDCLASS},
 364     {"ab[c-", "abb", -WC_UNCLOSEDCLASS},
 365     {"ab[c-]", "abb", -WC_INVALIDRANGE},
 366     {"ab[c-e]", "abb", 0},
 367     {"ab[c-e]", "abc", 1},
 368     {"ab[c-e]", "abd", 1},
 369     {"ab[c-e]", "abe", 1},
 370     {"ab[c-e]", "abf", 0},
 371     {"ab[e-c]", "abb", 0},
 372     {"ab[e-c]", "abc", 1},
 373     {"ab[e-c]", "abd", 1},
 374     {"ab[e-c]", "abe", 1},
 375     {"ab[e-c]", "abf", 0},
 376     {"ab[^c-e]", "abb", 1},
 377     {"ab[^c-e]", "abc", 0},
 378     {"ab[^c-e]", "abd", 0},
 379     {"ab[^c-e]", "abe", 0},
 380     {"ab[^c-e]", "abf", 1},
 381     {"ab[^e-c]", "abb", 1},
 382     {"ab[^e-c]", "abc", 0},
 383     {"ab[^e-c]", "abd", 0},
 384     {"ab[^e-c]", "abe", 0},
 385     {"ab[^e-c]", "abf", 1},
 386     {"ab[a^]", "aba", 1},
 387     {"ab[a^]", "ab^", 1},
 388     {"ab[a^]", "abb", 0},
 389     {"ab[^a^]", "aba", 0},
 390     {"ab[^a^]", "ab^", 0},
 391     {"ab[^a^]", "abb", 1},
 392     {"ab[-c]", "ab-", 1},
 393     {"ab[-c]", "abc", 1},
 394     {"ab[-c]", "abd", 0},
 395     {"ab[^-c]", "ab-", 0},
 396     {"ab[^-c]", "abc", 0},
 397     {"ab[^-c]", "abd", 1},
 398     {"ab[\\[-\\]]", "abZ", 0},
 399     {"ab[\\[-\\]]", "ab[", 1},
 400     {"ab[\\[-\\]]", "ab\\", 1},
 401     {"ab[\\[-\\]]", "ab]", 1},
 402     {"ab[\\[-\\]]", "ab^", 0},
 403     {"ab[^\\[-\\]]", "abZ", 1},
 404     {"ab[^\\[-\\]]", "ab[", 0},
 405     {"ab[^\\[-\\]]", "ab\\", 0},
 406     {"ab[^\\[-\\]]", "ab]", 0},
 407     {"ab[^\\[-\\]]", "ab^", 1},
 408     {"ab[a-fA-F]", "aba", 1},
 409     {"ab[a-fA-F]", "abF", 1},
 410     {"ab[a-fA-F]", "abZ", 0},
 411 };
 412
 413 const struct test full_tests[] = {
 414     {"a", "argh", 0},
 415     {"a", "ba", 0},
 416     {"a", "a", 1},
 417     {"a*", "aardvark", 1},
 418     {"a*", "badger", 0},
 419     {"*a", "park", 0},
 420     {"*a", "pArka", 1},
 421     {"*a", "parka", 1},
 422     {"*a*", "park", 1},
 423     {"*a*", "perk", 0},
 424     {"?b*r?", "abracadabra", 1},
 425     {"?b*r?", "abracadabr", 0},
 426     {"?b*r?", "abracadabzr", 0},
 427 };
 428
 429 int main(void)
 430 {
 431     int i;
 432     int fails, passes;
 433
 434     fails = passes = 0;
 435
 436     for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {
 437         const char *f, *t;
 438         int eret, aret;
 439         f = fragment_tests[i].wildcard;
 440         t = fragment_tests[i].target;
 441         eret = fragment_tests[i].expected_result;
 442         aret = wc_match_fragment(&f, &t);
 443         if (aret != eret) {
 444             printf("failed test: /%s/ against /%s/ returned %d not %d\n",
 445                    fragment_tests[i].wildcard, fragment_tests[i].target,
 446                    aret, eret);
 447             fails++;
 448         } else
 449             passes++;
 450     }
 451
 452     for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {
 453         const char *f, *t;
 454         int eret, aret;
 455         f = full_tests[i].wildcard;
 456         t = full_tests[i].target;
 457         eret = full_tests[i].expected_result;
 458         aret = wc_match(f, t);
 459         if (aret != eret) {
 460             printf("failed test: /%s/ against /%s/ returned %d not %d\n",
 461                    full_tests[i].wildcard, full_tests[i].target,
 462                    aret, eret);
 463             fails++;
 464         } else
 465             passes++;
 466     }
 467
 468     printf("passed %d, failed %d\n", passes, fails);
 469
 470     return 0;
 471 }
 472
 473 #endif