Inverted Index (反向索引)是搜索引擎需要做的一件经常性工作。在Google提出Map Reduce分布式编程框架中,这是一件很容易完成的事情。下面就是一个python写的示例。

Map:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#!/usr/bin/python
#
#  InvertedIndex mapper in Python
#  Author: Zeng, Xi
#  SID:    1010105140
#  Email:  [email protected]
 
import sys
 
def main(argv):
  line = sys.stdin.readline().lower()
  words = line.split()
  try:
    while line:
      for word in words[1:]:
        print line.split()[0] + "\t" + word
      line = sys.stdin.readline().lower()
      words = line.split()
  except "end of file":
    return None
if __name__ == "__main__":
  main(sys.argv)

Reduce:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/usr/bin/python
#
#  InvertedIndex reducer in Python
#  Author: Zeng, Xi
#  SID:    1010105140
#  Email:  [email protected]
 
import sys
index_list = {}
 
## collect (key,val) pairs from sort phase
for line in sys.stdin:
    try:
        word, index = line.strip().split("\t", 2)
 
        if index not in index_list:
            index_list[index] = word
        else:
            if word not in index_list[index]:
                index_list[index] = index_list[index] + " " + word
 
    except ValueError, err:
        sys.stderr.write("Value ERROR: %(err)s\n%(data)s\n" % {"err": str(err), "data": line})
 
## emit results
for word, index in index_list.items():
    print " ".join([word, index])