python 文本文件数据处理

Python014

python 文本文件数据处理,第1张

分隔日志文件存为小文件

#coding:utf-8

#file: FileSplit.py

import os,os.path,time

def FileSplit(sourceFile, targetFolder):

sFile = open(sourceFile, 'r')

number = 100000 #每个小文件中保存100000条数据

dataLine = sFile.readline()

tempData = [] #缓存列表

fileNum = 1

if not os.path.isdir(targetFolder):  #如果目标目录不存在,则创建

os.mkdir(targetFolder)

while dataLine: #有数据

for row in range(number):

tempData.append(dataLine) #将一行数据添加到列表中

dataLine = sFile.readline()

if not dataLine :

break

tFilename = os.path.join(targetFolder,os.path.split(sourceFile)[1] + str(fileNum) + ".txt")

tFile = open(tFilename, 'a+') #创建小文件

tFile.writelines(tempData) #将列表保存到文件中

tFile.close()

tempData = [] #清空缓存列表

print(tFilename + " 创建于: " + str(time.ctime()))

fileNum += 1 #文件编号

sFile.close()

if __name__ == "__main__" :

FileSplit("access.log","access")

分类汇总小文件:

#coding:utf-8

#file: Map.py

import os,os.path,re

def Map(sourceFile, targetFolder):

sFile = open(sourceFile, 'r')

dataLine = sFile.readline()

tempData = {} #缓存列表

if not os.path.isdir(targetFolder):  #如果目标目录不存在,则创建

os.mkdir(targetFolder)

while dataLine: #有数据

p_re = re.compile(r'(GET|POST)\s(.*?)\sHTTP/1.[01]',re.IGNORECASE) #用正则表达式解析数据

match = p_re.findall(dataLine)

if match:

visitUrl = match[0][1]

if visitUrl in tempData:

tempData[visitUrl] += 1

else:

tempData[visitUrl] = 1

dataLine = sFile.readline() #读入下一行数据

sFile.close()

tList = []

for key,value in sorted(tempData.items(),key = lambda k:k[1],reverse = True):

tList.append(key + " " + str(value) + '\n')

tFilename = os.path.join(targetFolder,os.path.split(sourceFile)[1] + "_map.txt")

tFile = open(tFilename, 'a+') #创建小文件

tFile.writelines(tList) #将列表保存到文件中

tFile.close()

if __name__ == "__main__" :

Map("access\\access.log1.txt","access")

Map("access\\access.log2.txt","access")

Map("access\\access.log3.txt","access")

3. 再次将多个文件分类汇总为一个文件。

#coding:utf-8

#file: Reduce.py

import os,os.path,re

def Reduce(sourceFolder, targetFile):

tempData = {} #缓存列表

p_re = re.compile(r'(.*?)(\d{1,}$)',re.IGNORECASE) #用正则表达式解析数据

for root,dirs,files in os.walk(sourceFolder):

for fil in files:

if fil.endswith('_map.txt'): #是reduce文件

sFile = open(os.path.abspath(os.path.join(root,fil)), 'r')

dataLine = sFile.readline()

while dataLine: #有数据

subdata = p_re.findall(dataLine) #用空格分割数据

#print(subdata[0][0],"  ",subdata[0][1])

if subdata[0][0] in tempData:

tempData[subdata[0][0]] += int(subdata[0][1])

else:

tempData[subdata[0][0]] = int(subdata[0][1])

dataLine = sFile.readline() #读入下一行数据

sFile.close()

tList = []

for key,value in sorted(tempData.items(),key = lambda k:k[1],reverse = True):

tList.append(key + " " + str(value) + '\n')

tFilename = os.path.join(sourceFolder,targetFile + "_reduce.txt")

tFile = open(tFilename, 'a+') #创建小文件

tFile.writelines(tList) #将列表保存到文件中

tFile.close()

if __name__ == "__main__" :

Reduce("access","access")

请详细阅读文档,使用a+的选项打开。

open(filename[, mode[, bufsize]])¶

Open a file, returning an object of the file type described in section File Objects. If the file cannot be opened, IOError is raised. When opening a file, it’s preferable to use open() instead of invoking the file constructor directly.

The first two arguments are the same as for stdio‘s fopen(): filename is the file name to be opened, and mode is a string indicating how the file is to be opened.

The most commonly-used values of mode are 'r' for reading, 'w' for writing (truncating the file if it already exists), and 'a' for appending (which on some Unix systems means that all writes append to the end of the file regardless of the current seek position). If mode is omitted, it defaults to 'r'. The default is to use text mode, which may convert '\n' characters to a platform-specific representation on writing and back on reading. Thus, when opening a binary file, you should append 'b' to the mode value to open the file in binary mode, which will improve portability. (Appending 'b' is useful even on systems that don’t treat binary and text files differently, where it serves as documentation.) See below for more possible values of mode.

The optional bufsize argument specifies the file’s desired buffer size: 0 means unbuffered, 1 means line buffered, any other positive value means use a buffer of (approximately) that size. A negative bufsize means to use the system default, which is usually line buffered for tty devices and fully buffered for other files. If omitted, the system default is used. [2]

Modes 'r+', 'w+' and 'a+' open the file for updating (note that 'w+' truncates the file). Append 'b' to the mode to open the file in binary mode, on systems that differentiate between binary and text fileson systems that don’t have this distinction, adding the 'b' has no effect.

In addition to the standard fopen() values mode may be 'U' or 'rU'. Python is usually built with universal newline supportsupplying 'U' opens the file as a text file, but lines may be terminated by any of the following: the Unix end-of-line convention '\n', the Macintosh convention '\r', or the Windows convention '\r\n'. All of these external representations are seen as '\n' by the Python program. If Python is built without universal newline support a mode with 'U' is the same as normal text mode. Note that file objects so opened also have an attribute called newlines which has a value of None (if no newlines have yet been seen), '\n', '\r', '\r\n', or a tuple containing all the newline types seen.

Python enforces that the mode, after stripping 'U', begins with 'r', 'w' or 'a'.

Python provides many file handling modules including fileinput, os, os.path, tempfile, and shutil.

Changed in version 2.5: Restriction on first letter of mode string introduced.

ord(c)¶