1. Two Sum，两数和

排在LeetCode最前面的，第1题，Two Sum，寻找两个数的和等于给定的Target，难度等级Easy。但是，这个既是Easy同时还是第1道热身的题，也有非常不简单的地方。

输入一个int array，一个target，在此array中寻找两个数，使这两个数的和等于target。每组输入都有且只有一个解。对于数组中的每个index对应的元素，只能使用一次（如果找到的是两个相等的数，则这两个数的index必不相同）。返回的index可以任意顺序。

暴力搜索（Brute Force）

寻找两个数的和，最简单粗暴的方式，就是两重循环，通过尝试所有的两两组合，来确定这个必定存在的解。由于只有一个解，找到即可返回。

/**
 * Note: The returned array must be malloced, assume caller calls free().
 */
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
    int i,j;
    int *rp = NULL;
    *returnSize = 0;
    for (i=0; i<numsSize; ++i)
        for (j=i+1; j<numsSize; ++j)
            if ((nums[i]+nums[j]) == target) {
                rp = (int*)malloc(sizeof(int)*2);
                if (rp == NULL)
                    return rp;
                rp[0] = i;
                rp[1] = j;
                *returnSize = 2;
                return rp;
            }
    return rp;
}

class Solution:
    def twoSum(self, nums: List[int], target: int) -> List[int]:
        size = len(nums)
        for i in range(size):
            for j in range(i+1,size):
                if nums[i]+nums[j] == target:
                    return [i,j]
        return [-1,-1]  # might be used to indicate error

class Solution {
public:
    vector<int> twoSum(vector<int>& nums, int target) {
        vector rtv{-1,-1};  // might be used to indicate error
        for (int i=0; i<nums.size(); ++i)
            for (int j=i+1; j<nums.size(); ++j)
                if (nums[i]+nums[j] == target) {
                    rtv[0] = i;
                    rtv[1] = j;
                    return rtv;
                }
        return rtv;
    }
};

哈希表查找（Two-pass）

使用Hash Table查找，在没有冲突的情况下是\(O(1)\)。当我们固定一个数\(a\)，\(target - a = b\)，对于某个a，如果b存在（index不同），这就是解。先建立Hash Table，然后在一个循环中查询Hash Table，查询是\(O(1)\)。建立Hash Table需要的时间和空间，与n是线性关系。

Python自带的dict类型，就是非常好用的hash table，所以，用Python来写这段代码，比较简单轻松。

class Solution:
    def twoSum(self, nums: List[int], target: int) -> List[int]:
        # create hash table,
        # the index with same value will be replaced.
        hash_tab = {v:i for i,v in enumerate(nums)}
        for i,v in enumerate(nums):
            a = target - v
            if a in hash_tab and hash_tab[a]!=i:
                return [i,hash_tab[a]]
        return [-1,-1]

无需顾虑哈希表中的冲突，nums中相同的值在hash_tab中的index值会被最大的index覆盖，根据题目，that's fine，要么这两个相同的元素就是解，在第2次循环时，排在前面的数会找到排在后面的数。要么这两个相同的元素都不是解。

下面是C语言版的解法，代码就比较复杂了，但是执行速度真心快。

typedef struct hash_node {
    int val;
    int idx;
    struct hash_node *next;
} hash_node;

/**
 * Note: The returned array must be malloced, assume caller calls free().
 */
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
    // prepare return value
    *returnSize = 0;
    int *prtv = malloc(sizeof(int*) * 2);
    if (prtv == NULL)
        return NULL;  // has to return NULL
    prtv[0] = prtv[1] = -1;  // might be used to indicate error

    // create hash table
    hash_node **phash = (hash_node**)malloc(sizeof(char*) * numsSize);
    if (phash == NULL)
        return prtv;
    memset(phash, 0, sizeof(char*)*numsSize);

    // pass one: create hash table nodes
    int key;
    hash_node *pnode, *ptmp;
    for (int i=0; i<numsSize; ++i) {
        key = abs(nums[i]) % numsSize;  // value to key
        pnode = (hash_node*)malloc(sizeof(hash_node));
        if (pnode == NULL)
            goto end;
        pnode->val = nums[i];
        pnode->idx = i;
        pnode->next = NULL;
        if (phash[key] == NULL)
            phash[key] = pnode;
        else {
            ptmp = phash[key];
            while (ptmp->next)
                ptmp = ptmp->next;
            ptmp->next = pnode;            
        }
    }

    // pass two: search
    int v;
    for (int i=0; i<numsSize; ++i) {
        v = target-nums[i];
        key = abs(v) % numsSize;
        if (phash[key] != NULL) {
            ptmp = phash[key];
            do {
                if ((ptmp->val==v) && (ptmp->idx!=i)) {
                    prtv[0] = i;
                    prtv[1] = ptmp->idx;
                    *returnSize = 2;
                    goto end;  // get out of the nested loop
                }
            } while (ptmp=ptmp->next);
        }
    }

end:;  // free and return
    hash_node *prev;
    for (int i=0; i<numsSize; ++i) {
        if (phash[i] != NULL) {
            prev = phash[i];
            do {
                ptmp = prev->next;
                free(prev);
            } while (prev=ptmp);
        }
    }
    free(phash);
    return prtv;
}

这个C版本，建立hash表，从value到key的计算方式为：key = abs(nums[i]) % numsSize。因此，冲突是很可能存在的！这个版本的内存占用比较高！这个实现考虑了冲突，其实是可以不用考虑的，不想改了...后面有uthash版本。

class Solution {
public:
    vector<int> twoSum(vector<int>& nums, int target) {
        int i,v;
        vector rtv {-1,-1};
        unordered_map<int,int> udict;
        udict.reserve(nums.size());
        // create hash table,
        // only save the last index for same value.
        for (i=0; i<nums.size(); ++i)
            udict.emplace(nums[i], i);
        // search
        for (i=0; i<nums.size(); ++i) {
            v = target - nums[i];
            auto t {udict.find(v)};
            if (t!=udict.end() && (udict[v]!=i)) {
                rtv[0] = i;
                rtv[1] = udict[v];
                return rtv;
            }
        }
        return rtv;
    }
};

哈希表查找（One-pass）

One-pass的思路是，在建立Hash Table的同时，其实就可以做查询了，一边建立一边查询，这就是one pass，只遍历一遍数据。解的两个数，一前一后，当循环到后面的那个数的时候，前面的数一定已经在hash表中存在了。而且，这个思路还彻底规避了找到相同index的风险。

class Solution:
    def twoSum(self, nums: List[int], target: int) -> List[int]:
        hash_tab = {}
        for i,v in enumerate(nums):
            # search first
            a = target - v
            if a in hash_tab:
                return [i,hash_tab[a]]
            # add to hash table
            hash_tab[v] = i
        return [-1,-1]

class Solution {
public:
    vector<int> twoSum(vector<int>& nums, int target) {
        vector rtv {-1,-1};
        int len { static_cast<int>(nums.size()) };
        unordered_map<int,int> ump;
        for(int i{}; i<len; ++i){
            int a = target - nums[i];
            if(ump.contains(a)){
                rtv[0] = i;
                rtv[1] = ump[a];
                return rtv;
            }
            ump[nums[i]] = i;
        }
        return rtv;
    }
};

typedef struct hash_node {
    int val;
    int idx;
    struct hash_node *next;
} hash_node;

/**
 * Note: The returned array must be malloced, assume caller calls free().
 */
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
    // prepare return value
    *returnSize = 0;
    int *prtv = (int*)malloc(sizeof(int*) * 2);
    if (prtv == NULL)
        return NULL;  // has to return NULL
    prtv[0] = prtv[1] = -1;  // might be used to indicate error

    // create hash table
    hash_node **phash = (hash_node**)malloc(sizeof(char*) * numsSize);
    if (phash == NULL)
        return prtv;
    memset(phash, 0, sizeof(char*)*numsSize);

    // one pass
    int key,v;
    hash_node *pnode, *ptmp;
    for (int i=0; i<numsSize; ++i) {
        // search first
        v = target - nums[i];
        key = abs(v) % numsSize;
        if (phash[key] != NULL) {
            ptmp = phash[key];
            do {
                if (ptmp->val==v) {
                    prtv[0] = i;
                    prtv[1] = ptmp->idx;
                    *returnSize = 2;
                    goto end;  // get out of the nested loop
                }
            } while (ptmp=ptmp->next);
        }
        // add to hash table
        key = abs(nums[i]) % numsSize;
        pnode = (hash_node*)malloc(sizeof(hash_node));
        if (pnode == NULL)
            goto end;
        pnode->val = nums[i];
        pnode->idx = i;
        pnode->next = NULL;
        if (phash[key] == NULL)
            phash[key] = pnode;
        else {
            ptmp = phash[key];
            while (ptmp->next)
                ptmp = ptmp->next;
            ptmp->next = pnode;
        }
    }

end:;  // free and return
    hash_node *prev;
    for (int i=0; i<numsSize; ++i) {
        if (phash[i] != NULL) {
            prev = phash[i];
            do {
                ptmp = prev->next;
                free(prev);
            } while (prev=ptmp);
        }
    }
    free(phash);
    return prtv;
}

One-pass & Memory Pool

C语言的实现，还有个可以优化的细节，即使用memory pool。建立Hash Table的时候，不要一次次的malloc（对应一次次的free），而是事先准备一个足够大小的memory pool，每次从pool中取一块来使用。虽然会造成memory的使用增大，但，从来都是内存换速度呀.....

此题的约束条件中说明，int array的最大长度是10^4。使用下面这个struct hash_node：

typedef struct hash_node {
    int val;
    int idx;
    struct hash_node *next;
} hash_node;

最大需要的内存为：156.25KB，不算特别多，值得一试！代码如下：

typedef struct hash_node {
    int val;
    int idx;
    struct hash_node *next;
} hash_node;


/**
 * Note: The returned array must be malloced, assume caller calls free().
 */
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
    // prepare return value
    *returnSize = 0;
    int *prtv = (int*)malloc(sizeof(int*) * 2);
    if(!prtv)
        return NULL;  // has to return NULL
    prtv[0] = prtv[1] = -1;  // might be used to indicate error

    // create hash table
    hash_node **phash = (hash_node**)malloc(sizeof(char*) * numsSize);
    if (phash == NULL)
        return prtv;
    memset(phash, 0, sizeof(char*)*numsSize);
    // malloc memory pool
    hash_node *ht_mem = (hash_node*)malloc(sizeof(hash_node)*numsSize);
    if(!ht_mem){
        free(phash);
        return NULL;
    }
    int ht_idx = 0;

    // one pass
    int key,v;
    hash_node *pnode, *ptmp;
    for (int i=0; i<numsSize; ++i) {
        // search first
        v = target - nums[i];
        key = abs(v) % numsSize;
        if (phash[key] != NULL) {
            ptmp = phash[key];
            do {
                if (ptmp->val==v) {
                    prtv[0] = i;
                    prtv[1] = ptmp->idx;
                    *returnSize = 2;
                    goto end;  // get out of the nested loop
                }
            } while (ptmp=ptmp->next);
        }
        // add to hash table
        key = abs(nums[i]) % numsSize;
        // pnode = (hash_node*)malloc(sizeof(hash_node));
        pnode = (hash_node*)&ht_mem[ht_idx++];
        pnode->val = nums[i];
        pnode->idx = i;
        pnode->next = NULL;
        if (phash[key] == NULL)
            phash[key] = pnode;
        else {
            ptmp = phash[key];
            while (ptmp->next)
                ptmp = ptmp->next;
            ptmp->next = pnode;
        }
    }

end:;  // free and return
    free(phash);
    free(ht_mem);
    return prtv;
}

排序+双向逼近（Two Points）

加速搜索的常规手段，前面已经使用了Hash Table，另一个是先排序，对排序后的数据进行搜索，比如著名的二分搜索算法。但此题不能使用二分搜索，因为要寻找的是两个数，以他们的和作为找到的判定条件。我们可以采用一个巧妙的办法，用双向逼近的方法。这个方法，同样避免了index相同的场景。由于存在一次排序，这个算法的Big-Oh将由排序算法主导，各语言内置的排序算法接口，都非常快，但最快也要\(N\log{N}\)。

class Solution:
    def twoSum(self, nums: List[int], target: int) -> List[int]:
        n = len(nums)
        a = [(n,i) for i,n in enumerate(nums)]
        a.sort(key=lambda x: x[0])
        left = 0
        right = n-1
        while left < right:
            s = a[left][0] + a[right][0]
            if s == target:
                return [a[left][1], a[right][1]]
            if s < target:
                left += 1
            else:
                right -= 1
        return [-1,-1]

typedef struct node {
    int val;
    int idx;
} node;

int comp_node(const void *pa, const void *pb){
    // 用比较的方式，规避了int计算可能的溢出
    int a = ((node*)pa)->val;
    int b = ((node*)pb)->val;
    return a<b ? -1 : a>b ? 1 : 0;
}

/**
 * Note: The returned array must be malloced, assume caller calls free().
 */
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
    *returnSize = 2;
    int *prtv = (int*)malloc(sizeof(int*) * 2);

    // sort by value
    node *pnode = (node*)malloc(sizeof(node) * numsSize);
    /*for(int i=0; i<numsSize; ++i){
        pnode[i].val = nums[i];
        pnode[i].idx = i;
    }*/
    int i = 0;
    int g = numsSize / 16;
    while(g--){
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
    }
    while(i < numsSize){
        pnode[i].val=nums[i]; pnode[i].idx=i; ++i;
    }

    qsort(pnode, numsSize, sizeof(node), comp_node);

    // bi-directional search
    int left = 0;
    int right = numsSize-1;
    while(left < right){
        int sum = pnode[left].val + pnode[right].val;
        if(sum == target){
            prtv[0] = pnode[left].idx;
            prtv[1] = pnode[right].idx;
            free(pnode);
            return prtv;
        }
        if(sum < target)
            ++left;
        else
            --right;
    }

    // never reach here...
    free(pnode);
    return prtv;
}

class Solution {
public:
    vector<int> twoSum(vector<int>& nums, int target) {
        vector rtv {-1,-1};
        vector<pair<int,size_t>> idxnums;
        size_t len { nums.size() };
        idxnums.reserve(len);

        for(size_t i{0}; i<len; ++i)
            idxnums.emplace_back(nums[i],i);

        sort(idxnums.begin(), 
             idxnums.end(), 
             [](decltype(idxnums)::value_type &a,
                decltype(idxnums)::value_type &b){
                    return a.first < b.first;
                });

        size_t left {0};
        size_t right {len-1};
        while(left < right){
            int sum { idxnums[left].first+idxnums[right].first };
            if(sum == target){
                rtv[0] = idxnums[left].second;
                rtv[1] = idxnums[right].second;
                return rtv;
            }
            if(sum < target)
                ++left;
            else
                --right;
        }

        // never reach here
        return rtv;
    }
};

uthash（C语言）

在C语言中构建较为复杂的hash表是比较费劲的，LeetCode提供uthash可以用，想必是uthash用的人多啊。我也来尝试一下，使用one pass算法：

typedef struct node {
    int val;
    int idx;
    UT_hash_handle hh;
} Node;

int* twoSum(int* nums, int numsSize, int target, int* returnSize){
    *returnSize = 2;
    int *prtv = malloc(sizeof(int*)*2);

    Node *pnodes = malloc(sizeof(Node)*numsSize);
    int node_idx = 0;
    Node *htable = NULL;
    Node *ps;
    int search;
    for(int i=0; i<numsSize; ++i){
        ps = NULL;
        search = target - nums[i];
        HASH_FIND_INT(htable, &search, ps);
        if(!ps){
            Node *pa = &pnodes[node_idx++];
            pa->val = nums[i];
            pa->idx = i;
            HASH_ADD_INT(htable, val, pa);
        }
        else{
            prtv[0] = ps->idx;
            prtv[1] = i;
            break;
        }
    }

    HASH_CLEAR(hh, htable);
    free(pnodes);
    return prtv;
}

暴力美学

class Solution:
    def twoSum(self, nums: List[int], target: int) -> List[int]:
        n = len(nums)
        for i in range(n):
            for j in range(i+1,n):
                if nums[i]+nums[j] == target:
                    return [i,j]
                if nums[n-i-1]+nums[n-j-1] == target:
                    return [n-i-1,n-j-1]
        return [0,0]

在每一个loop中，干双份工作，同时按两个方向遍历。存在潜在的重复计算问题，但在此问题上，实际效果非常好！非常好...各个语言的版本都是最好战绩，而且它不消耗内存！

这个方法快的原因：只要找到解，就立即return！同时，解出现的位置，大概率在数组两边！

SIMD

#include <immintrin.h>

class Solution {
public:
    vector<int> twoSum(vector<int>& nums, int target) {
        vector rtv {-1,-1};
        int len { static_cast<int>(nums.size()) };
        unordered_map<int,int> ump;
        __m128i t = _mm_set1_epi32(target);
        int *data { nums.data() };
        int presult[4];
        int i{};
        for(; i<len/4; ++i){
            __m128i a = _mm_loadu_si128((__m128i_u*)(data+i*4));
            __m128i b = _mm_sub_epi32(t, a);
            _mm_storeu_si128((__m128i_u*)presult, b);
            for(int j{}; j<4; ++j){
                if(ump.contains(presult[j])){
                    rtv[0] = i*4+j;
                    rtv[1] = ump[presult[j]];
                    return rtv;
                }
                ump[nums[i*4+j]] = i*4+j;
            }
        }
        for(i=i*4; i<len; ++i){
            int a = target - nums[i];
            if(ump.contains(a)){
                rtv[0] = i;
                rtv[1] = ump[a];
                return rtv;
            }
            ump[nums[i]] = i;
        }
        return rtv;
    }
};

#include <immintrin.h>

class Solution {
public:
    __attribute__((target("avx2")))
    vector<int> twoSum(vector<int>& nums, int target) {
        vector rtv {-1,-1};
        int len { static_cast<int>(nums.size()) };
        unordered_map<int,int> ump;
        __m256i t = _mm256_set1_epi32(target);
        int *data { nums.data() };
        int presult[8];
        int i{};
        for(; i<len/8; ++i){
            __m256i a = _mm256_loadu_si256((__m256i_u*)(data+i*8));
            __m256i b = _mm256_sub_epi32(t, a);
            _mm256_storeu_si256((__m256i_u*)presult, b);
            for(int j{}; j<8; ++j){
                if(ump.contains(presult[j])){
                    rtv[0] = i*8+j;
                    rtv[1] = ump[presult[j]];
                    return rtv;
                }
                ump[nums[i*8+j]] = i*8+j;
            }
        }
        for(i=i*8; i<len; ++i){
            int a = target - nums[i];
            if(ump.contains(a)){
                rtv[0] = i;
                rtv[1] = ump[a];
                return rtv;
            }
            ump[nums[i]] = i;
        }
        return rtv;
    }
};

用C就先不麻烦hash table了，直接用brute force配合AVX的32字节并行能力，用并行加法。

#include <immintrin.h>

__attribute__((target("avx2")))
int* twoSum(int* nums, int numsSize, int target, int* returnSize){
    *returnSize = 2;
    int *prtv = malloc(sizeof(int*)*2);
    int presult[8], j;
    for(int i=0; i<numsSize; ++i){
        if(numsSize-i-1 >= 8){
            __m256i a = _mm256_set1_epi32(nums[i]);
            for(j=0; j<(numsSize-i-1)/8; ++j){
                __m256i b = _mm256_loadu_si256((__m256i_u*)(nums+j*8+i+1));
                __m256i c = _mm256_add_epi32(a,b);
                _mm256_storeu_si256((__m256i_u*)presult, c);
                for(int k=0; k<8; ++k)
                    if(presult[k] == target){
                        prtv[0] = i;
                        prtv[1] = j*8 + i+1 + k;
                        return prtv;
                    }
            }
        } else {
            for(j=i+1; j<numsSize; ++j)
                if(nums[i]+nums[j] == target){
                    prtv[0] = i;
                    prtv[1] = j;
                    return prtv;
                }
        }
    }
    return prtv;
}

纯数字线性复杂度的排序

纯数字的排序，可以做到线性复杂度，不知是否可能比qsort更快？