lines = s.split('\n')
dictlines = lines[:100]
mydict = {}
# read
for i,line in enumerate(dictlines ):
for word in line.split():
mydict.setdefault(word,[]).append(i + 1)
# print indices
for word in mydict.keys():
print "%s: %s" % (word,", ".join(map(str,sorted(mydict[word]))))
def andSearch(words_list):
global mydict
a = set(range(1,101))
for word in words_list:
a = a.intersection(set(mydict[word]))
return a
def orSearch(words_list):
global mydict
a = set([])
for word in words_list:
a = a.union(set(mydict[word]))
return a
# Query
index = 100
u = lines[index]
while index < len(lines):
words_list = u.split()
if ":" in u:
if words_list[0] == "OR:":
a = orSearch(words_list)
else:
if words_list[0] == 'AND:':
words_list = words_list[1:]
a = andSearch(words_list)
if not a:
print ", ".join(map(str,list(a)))
else:
print "None"
index += 1
大致思想就是这样。。。。。。。。
cdays-3-test.txt 内容:
1 key1
2 key2
3 key1
7 key3
8 key2
10 key1
14 key2
19 key4
20 key1
30 key3
读取某一简单索引文件cdays-3-test.txt,其每行格式为文档序号 关键词,现需根据这些信息转化为倒排索引,即统计关键词在哪些文档中,格式如下:包含该关键词的文档数 关键词 =>文档序号。其中,原索引文件作为命令行参数传入主程序,并设计一个collect函式统计 "关键字<->序号" 结果对,最后在主程序中输出结果至屏幕。