最新发布

# 2023-02-09
创维电视能不能升级鸿蒙系统
创维电视不能升级鸿蒙系统。目前只有华为智慧屏和荣耀智慧屏搭载了鸿蒙系统，而鸿蒙系统刚刚正式发布，创维还没有正式加入鸿蒙阵营，短期内不会为创维电视适配鸿蒙系统，所以目前创维电视是无法升级鸿蒙系统的。由于鸿蒙系统是完全开源的系统，基于安卓系统的智
# 2023-02-09
鸿蒙HarmonyOS系统用户已突破3000万，跻身第三大操作系统？
华为HarmonyOS操作系统用户已经突破3000万，计划2021年底突破三亿台设备。北京时间7 月 8 日，华为官方透露，华为 Harmony OS 2.0 用户已经达到 3000 万。新系统发布仅一个多月，相当于每天有一百
# 2023-02-09
华为HarmonyOS与安卓对比：鸿蒙系统的强大不止于此
自从HarmonyOS 2上线后，HarmonyOS优越的性能表现让大家眼前一亮，我认为该系统最大的优点就是可在后台打开多个大型游戏且能保持游戏不中断，即后台保活率高。据测评媒体@小白测评的实验数据显示搭载H
# 2023-02-09
harmonyos可以玩原神吗
harmonyos可以玩原神。根据天眼查显示，HarmonyOS2系列现已推送到多款机型中，可将正在游玩的《原神》一键从Mate40Pro转到MatePad上。《原神》是由上海米哈游制作发行的一款开放世界冒险游戏。华为于7月12日宣布，H
# 2023-02-09
harmonyos是什么意思
harmonyos即鸿蒙系统的意思，正确写法为harmony os。harmony os鸿蒙系统是华为公司在2019年8月9日于东莞举行华为开发者大会（HDC.2019）上正式发布的操作系统。鸿蒙系统面向全场景的分布式操作，将人、设备、
# 2023-02-09
鸿蒙抄袭安卓？看这一篇就够了
01 什么是 AOSP？很多人都说鸿蒙是 AOSP 套壳，那么我们首先得明白什么是 AOSP？AOSP 是"Android Open Source Project"
# 2023-02-09
鸿蒙系统的缩小屏幕功能
鸿蒙系统的缩小屏幕功能说明如下：首先在屏幕的左侧、右侧滑动并长按打开侧边栏；在侧边栏选择需要分屏的应用，可上下滑动选择应用，可以点击最下方的按钮查看更多应用。选择应用后，将会在屏幕上直接以小窗口的形式显示；可按住上方的横条进行拖动，也可
# 2023-02-09
p50HarmonyOS新桌面有哪些功能？
HarmonyOS 提供服务卡片、大文件夹和小艺建议，让您把重要信息放在眼前，操作更快捷，屏幕也更个性化。P50手机系统为HarmonyOS 2，具体功能如下：状态栏：通过顶部状态栏查看手机状态、通知消息。大文件夹：无需展开文件夹，可一步打
# 2023-02-09
magicos和鸿蒙os区别
magicos和鸿蒙os区别：两者定位不同，技术架构不同。两者定位不同：HarmonyOS旨在替换安卓、最终实现跨平台多设备分布式操作。MagicOS则是在安卓系统、Windows系统以及其它操作系统上叠加荣耀的核心能力，从而让不同生态实
# 2023-02-09
华为手机开机显示Harmony OS是什么情况?
如果您的手机开机进入Harmony OS界面、EMUI界面、FASTBOOT界面，可能因为如下原因：（1）可能是无意按到了开机键+音量键的组合键进入了特殊模式，建议您长按电源键15秒以上，尝试强制重启手机，即可正常进入手机桌面。温馨提醒

用Python统计词频

2023-03-03 15:11:02Python012

用Python统计词频,第1张

def statistics(astr):
    """Return the distinct tab-separated fields of one line, in first-seen order.

    The line is split on tab characters. Newline characters are removed from
    the final field (the one that carries the line terminator) before the
    fields are de-duplicated, so a last field that repeats an earlier one is
    not reported twice.

    Args:
        astr: One line of text, optionally ending in a newline.

    Returns:
        A list of the unique fields, ordered by first occurrence. An empty
        input yields a list holding one empty string, because splitting an
        empty string produces a single empty field.
    """
    fields = astr.split("\t")
    # str.split always returns at least one element, so [-1] is safe.
    fields[-1] = fields[-1].replace("\n", "")

    unique = []
    seen = set()  # O(1) membership test instead of scanning the result list
    for field in fields:
        if field not in seen:
            seen.add(field)
            unique.append(field)
    return unique

if __name__ == "__main__":
    # Tally, for every field returned by statistics(), how many times it is
    # returned across all lines of the input file.
    code_doc = {}

    with open("test_data.txt", "r", encoding='utf-8') as fs:
        for ln in fs.readlines():
            for field in statistics(ln):
                code_doc[field] = code_doc.get(field, 0) + 1

    # One "field count" pair per output line, in first-seen order.
    for field, count in code_doc.items():
        print(field + ' ' + str(count))

简单版：

#!/usr/bin/env python3

import re

import jieba

from collections import Counter

# File whose contents are counted.
fname = 'counttest.txt'

with open(fname) as f:
    s = f.read()

# One or more ASCII letters, optionally followed by a single hyphen and
# further letters.
pattern = re.compile(r'[a-zA-Z]+\-?[a-zA-Z]*')

matched_words = pattern.findall(s)
english_words = Counter(matched_words)

# Whatever remains once the English words are cut out is segmented by jieba.
remainder = pattern.sub('', s)
other_words = Counter(jieba.cut(remainder))

# Print each tally under its heading, most frequent entry first.
for heading, rule_width, tally in (
    ('\n英文单词统计结果：\n', 17, english_words),
    ('\n中文及符号统计结果：\n', 19, other_words),
):
    print(heading + '-' * rule_width)
    print('\n'.join('{}: {}'.format(word, n) for word, n in tally.most_common()))

复杂版：

#!/usr/bin/env python

# -*- coding: utf-8 -*-

from __future__ import print_function, division, unicode_literals

import sys, re, time, os, jieba

from collections import Counter

from datetime import datetime

class WordCounter(object):
    """Count English words and the remaining (Chinese/symbol) tokens of a file.

    The whole file is read into memory, decoded and split into two tallies
    kept in ``self._c``: index 0 holds the English words matched by
    ``_ENGLISH_WORD``; index 1 holds what is left after those words are
    removed (jieba tokens when jieba is enabled, single characters otherwise).

    How to use:
        w = WordCounter('a.txt', 'b.txt')
        w.run()
    """

    # One or more ASCII letters, optionally followed by a single hyphen and
    # further letters.
    _ENGLISH_WORD = re.compile(r'[a-zA-Z]+\-?[a-zA-Z]*')

    # Values of ``to_file`` that make run() print instead of writing a file.
    # NOTE(review): the string spellings presumably exist so the value can be
    # given on the command line -- confirm against callers.
    _NO_OUTPUT = ('None', 'Null', 'none', 'null', None)

    def __init__(self, from_file, to_file=None, coding=None, jieba_cut=None):
        """Prepare a counter for ``from_file``.

        Args:
            from_file: Path of the file to read.
            to_file: Path the result is written to. When it is None (or one
                of the strings 'None', 'Null', 'none', 'null') the result is
                printed instead.
            coding: Encoding of ``from_file``. When None it is guessed by
                chardet from the first 10000 bytes, falling back to UTF-8 if
                chardet reports no encoding.
            jieba_cut: Any value other than None enables jieba segmentation
                of the non-English text.

        Raises:
            Exception: If ``from_file`` is not an existing regular file.
        """
        if not os.path.isfile(from_file):
            raise Exception('No such file: 文件不存在')
        self.f1 = from_file
        self.filesize = os.path.getsize(from_file)
        self.f2 = to_file

        if coding is None:
            try:
                import chardet
            except ImportError:
                # NOTE(review): best-effort auto-install kept as-is; it runs
                # whichever `pip` is first on PATH.
                os.system('pip install chardet')
                print('-'*70)
                import chardet
            with open(from_file, 'rb') as f:
                detected = chardet.detect(f.read(10000))['encoding']
            # chardet reports None when it cannot classify the sample (for
            # example an empty file); decoding with None would raise.
            coding = detected or 'utf-8'
        self.coding = coding

        self._c = [Counter(), Counter()]
        self.jieba = jieba_cut is not None

    def run(self):
        """Count the file, then write the result to ``to_file`` or print it.

        The elapsed time is stored in ``self.cost`` as a string such as
        ``'0.1s'``.
        """
        start = time.time()
        self.count_direct(self.f1)

        if self.f2 not in self._NO_OUTPUT:
            report = self.result
            try:
                payload = report.encode(self.coding)
            except UnicodeEncodeError:
                # The report always contains a Chinese heading, which a
                # narrow source encoding such as ASCII cannot represent.
                payload = report.encode('utf-8')
            with open(self.f2, 'wb') as f:
                f.write(payload)
        else:
            print('\nEnglish words:\n' + '-'*15)
            print(self.result)

        self.cost = '{:.1f}'.format(time.time()-start) + 's'

    def count_direct(self, from_file):
        """Read all of ``from_file`` into memory and add its counts to the tallies."""
        with open(from_file, 'rb') as f:
            data = f.read()
        # Parse once; parse() returns one Counter per tally, in the same order.
        for tally, parsed in zip(self._c, self.parse(data)):
            tally.update(parsed)

    def parse(self, line):
        """Decode raw bytes and count their contents.

        Args:
            line: Bytes read from the input file.

        Returns:
            A tuple ``(english, rest)`` of Counters: the English words, and
            the tokens of the text that remains once those words are removed.
        """
        text = line.decode(self.coding)
        # Re-join a word hyphenated across a line break by dropping "-\n".
        text = re.sub(r'\-\n', '', text)
        english_words = self._ENGLISH_WORD.findall(text)
        rest = self._ENGLISH_WORD.sub('', text)
        # Both modes count `rest`, so letters that already belong to an
        # English word are not counted a second time as single characters.
        ex = Counter(jieba.cut(rest)) if self.jieba else Counter(rest)
        return Counter(english_words), ex

    def flush(self):
        """Discard all counts collected so far."""
        self._c = [Counter(), Counter()]

    @property
    def counter(self):
        """The two tallies as a list ``[english, rest]`` of Counters."""
        return self._c

    @property
    def result(self):
        """The counts as text, identical to what run() writes to the result file."""
        sections = []
        for tally in self._c:
            sections.append(['{}: {}'.format(i, j) for i, j in tally.most_common()])
        # Heading placed between the English section and the remaining one.
        tip = '\n\n中文及符号统计结果:\n'+'-'*15+'\n'
        return tip.join(['\n'.join(s) for s in sections])

def humansize(size):
    """Convert a byte count into a short string with a unit.

    The value is integer-divided by 1024 until it drops below 1024 or the
    largest unit ('T') is reached; larger values stay expressed in 'T'.

    Args:
        size: Number of bytes.

    Returns:
        A string of the form ``'<number> <unit>'`` where the unit is one of
        'B', 'KB', 'M', 'G', 'T'.

    >>> humansize(1024) == '1 KB'
    True
    >>> humansize(1000) == '1000 B'
    True
    >>> humansize(1024*1024) == '1 M'
    True
    >>> humansize(1024*1024*1024*2) == '2 G'
    True
    >>> humansize(1024**5) == '1024 T'
    True
    """
    units = ['B', 'KB', 'M', 'G', 'T']
    # Only divide when a larger unit is still available; dividing after the
    # last unit would under-report the value by a factor of 1024.
    for unit in units[:-1]:
        if size < 1024:
            return '{} {}'.format(size, unit)
        size = size // 1024
    return '{} {}'.format(size, units[-1])

def main():
    """Command-line entry point: count one file and write or print the result.

    Arguments of the form ``coding=VALUE`` or ``jieba_cut=VALUE`` are options
    passed on to WordCounter. Of the remaining arguments the first is the
    input file and the second, if present, the output file; without an
    output file the result is printed.

    Exits with status 1 after printing a usage line when no input file is
    given.
    """
    args = {'coding': None, 'jieba_cut': 1}
    positional = []
    for arg in sys.argv[1:]:
        is_option = False
        for k in args:
            found = re.search(r'{}=(.+)'.format(k), arg)
            if found:
                args[k] = found.group(1)
                is_option = True
        # Anything that is not an option is a file name, so an option can no
        # longer be mistaken for the output path.
        if not is_option:
            positional.append(arg)

    if not positional:
        print('Usage: python wordcounter.py from_file to_file')
        sys.exit(1)

    from_file = positional[0]
    # The output file is optional; WordCounter prints when it is None.
    to_file = positional[1] if len(positional) > 1 else None

    w = WordCounter(from_file, to_file, **args)
    w.run()

if __name__ == '__main__':
    # Run this module's doctests first, then the command-line entry point.
    from doctest import testmod

    testmod()
    main()

更复杂的：如果是比较大的文件，建议采用多进程，详情百度：多进程读取大文件并统计词频 jaket5219999

#! python3

# -*- coding: utf-8 -*-

import os, codecs

import jieba

from collections import Counter

def get_words(txt):
    """Print the 100 most frequent jieba tokens of ``txt`` as a text bar chart.

    Tokens of length 1 and the carriage-return/line-feed token are ignored.
    Each output line shows the token right-aligned in a 5-character column,
    one '*' for every three occurrences, and the count itself.

    Args:
        txt: The text to segment and count.
    """
    frequencies = Counter(
        token
        for token in jieba.cut(txt)
        if len(token) > 1 and token != '\r\n'
    )

    print('常用词频度统计结果')
    for word, count in frequencies.most_common(100):
        padding = ' ' * (5 - len(word))  # empty once the word has 5+ characters
        bar = '*' * int(count / 3)
        print('%s%s %s %d' % (padding, word, bar, count))

if __name__ == '__main__':
    # Read the whole sample file as UTF-8, then report its word frequencies.
    with codecs.open('19d.txt', mode='r', encoding='utf8') as source:
        content = source.read()
    get_words(content)

文件 结果 词频 进程 中文

# 上一篇：求个旅游网电子商务网站模板

# 下一篇：如何从电脑上下载歌曲到SD卡上？