Thursday, April 17, 2014

LeetCode: Implement strStr()

Implement strStr().

Returns a pointer to the first occurrence of needle in haystack, or null if needle is not part of haystack.


The naive approach for this problem is to compare from the start and when one mismatch is found, restart from the next position of the haystack. There are also some classic algorithm for this problem, such as Rabin–Karp string search algorithm, which is the similar idea as the naive one; Boyer–Moore string search algorithm, which is complicated but was used in the implementation of glibc. And Knuth–Morris–Pratt algorithm, which used a pre-computed table to boost the searching.

I implemented the naive idea in C++:

/*
 * func: str_str
 * goal: find the first occurence of needle in haystack
 * @param haystack: the source string
 * @param needle: the target string
 * return: the pointer to the first occurence of target string
 * complexity: time O(n^2) where n is the length of haystack
 */
char* str_str(char* haystack, char* needle){
    //If needle is null, return haystack
    if(needle == nullptr || haystack == nullptr){
        return haystack;
    }
    
    size_t haystack_len = strlen(haystack);
    size_t needle_len = strlen(needle);
    if(needle_len > haystack_len){
        return nullptr;
    }
    
    char* p_haystack = haystack;
    while(*(p_haystack+needle_len-1)){
        char* haystack_start = p_haystack;
        char* needle_start = needle;
        while(*haystack_start && *needle_start && *haystack_start == *needle_start){
            ++haystack_start;
            ++needle_start;
        }
        //If needle_start goes to the end, we have find one
        if(! *needle_start){
            return p_haystack;
        }
        ++p_haystack;
    }
    return nullptr;
}

Then I looked up online to find the how each compiler implemented this function. I found some source code from opensource.apple.com, which didn't seem to be real since the codes are not elegant and there were some risks. However, assuming they were true, they also applied the naive way to implement.

/*
 * func: strncmp_libc_apple
 * goal: compare the two string within size n
         http://www.opensource.apple.com/source/Libc/Libc-167/gen.subproj/i386.subproj/strncmp.c
 * @param s1: input string s1
 * @param s2: input string s2
 * @param n: length n
 * return: A zero value indicates that the characters compared 
           in both strings form the same string. A value greater than 
           zero indicates that the first character that does not match 
           has a greater value in str1 than in str2; And a value less 
           than zero indicates the opposite.
 */
int strncmp_libc_apple(s1, s2, n)
register const char *s1, *s2;
register size_t n;
{
    if(n == 0){
        return (0);
    }
    //They really like do while...
    do{
        if(*s1 != *s2++)
            return (*(unsigned char *)s1 - *(unsigned char *)--s2);
        //No check for nullptr?
        if(*s1++ == 0)
            break;
    }while(--n != 0);
    return (0);
}


/*
 * func: str_str_xnu
 * goal: find the first occurence of needle in haystack
         source code from Apple opensource
         http://www.opensource.apple.com/source/xnu/xnu-792.13.8/libsa/strstr.c
 * @param in: input string
 * @param str: target string
 * return: the pointer to the first occurence of target string
 */
char* str_str_xnu(const char *in, const char *str){
    char c;
    size_t len;
    //NOTE: Bad access will occur if str is nullptr
    c = *str++;
    if(!c){
        //Trivial empty string case
        return (char *) in;
    }
    
    len = strlen(str);
    do{
        char sc;
        do{
            sc = *in++;
            if(!sc){
                return (char *) 0;
            }
        }while(sc != c);
    }while(strncmp(in, str, len) != 0);
    
    return (char *)(in-1);
}

/*
 * func: str_str_gcc_apple
 * goal: find the first occurence of needle in haystack
         source code from Apple opensource
         http://opensource.apple.com/source/gcc/gcc-937.2/libiberty/strstr.c
 * @param s1: input string
 * @param s2: target string
 * return: the pointer to the first occurence of target string
 */
char* str_str_gcc_apple(char* s1, char* s2){
    register char *p = s1;
    extern char *strchr();
    extern int strncmp();
#if __GNUC__ == 2
    extern __SIZE_TYPE__ strlen;
#endif
    register int len = strlen(s2);
    
    for(;(p = strchr(p, *s2)) != 0; p++){
        if(strncmp(p, s2, len) == 0){
            return (p);
        }
    }
    return (0);
}

/*
 * func: str_str_sun_apple
 * goal: find the first occurence of needle in haystack
         source code from Apple opensource
 * @param string: input string
 * @param substring: target string
 * return: the pointer to the first occurence of target string
 */
char* str_str_sun_apple(register char *string, char *substring){
    register char *a, *b;
    /* First scan quickly through the two strings looking for a 
     * single-character match. When it's found, then compare the
     * rest of the substring.
     */
    b = substring;
    if(*b == 0){
        return string;
    }
    for(;*string != 0; string += 1){
        if(*string != *b){
            continue;
        }
        a = string;
        while(1){
            if(*b == 0){
                return string;
            }
            if(*a++ != *b++){
                break;
            }
        }
        b = substring;
    }
    return (char *) 0;
}

Then I implemented the KMP one with Python.

Python Code:

# func: implement strstr() function in c, which is used to find the
#       first appearance of target string in base text
# @param needle: target string
# @param haystack: base text string
# @return: the first appearance position of target string
# complexity: time O(m+n), space O(m) where m is the length of target string
#             n is the length of text
def str_str(needle, haystack):
    if not needle:
        return haystack
    else:
        #Build look up table for KMP
        kmp_pre = [-1] * len(needle)
        i = 0
        j = -1
        while i < len(needle):
            #If the current letter is not in the prefix
            #Go backward
            while j > -1 and needle[i] != needle[j]:
                j = kmp_pre[j]
            i += 1
            j += 1
            if needle[i] == needle[j]:
                kmp_pre[i] = kmp_pre[j]
            else:
                #The stop position of the prefix
                kmp_pre[i] = j

        #Start KMP matching
        i = 0
        j = 0
        while j < len(haystack):
            while i > -1 and needle[i] != haystack[j]:
                i = kmp_pre[i]
            i += 1
            j += 1
            if i >= len(needle):
                return j-i
        return None

But the submission results from Online Judge is not quite exciting. I compared the running time of KMP and naive way. The naive way is almost 100ms faster than KMP, which amazed me. I think the cause may be related to the contents of the test cases.

Reference Link:

1.Knuth–Morris–Pratt algorithm: http://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm

2.Knuth-Morris-Pratt algorithm: http://www-igm.univ-mlv.fr/~lecroq/string/node8.html

3.Rabin–Karp algorithm: http://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_string_search_algorithm

4.Boyer–Moore string search algorithm: http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm

5.glibc: string/strstr.c Source File: http://fossies.org/dox/glibc-2.19/string_2strstr_8c_source.html

No comments:

Post a Comment