可以不学习stata,只学习r语言吗

Python029

可以不学习stata,只学习r语言吗,第1张

/* Lecture 1: How to import and export data */

// A comment that fits on one line can use a double slash.

// Load a dataset that ships with Stata; clear drops whatever is in memory.
sysuse "auto.dta", clear

// In the Data Browser string variables are shown in red; the blue values of
// foreign are value labels (domestic corresponds to 0, foreign to 1).
replace make = "AMC" in 1

// ---- Export data ----
cd "D:\BaiduNetdiskDownload\stata application"

help export delimited   /* built-in help; export delimited writes csv/txt only */

/* To export only some of the variables:
   export delimited [varlist] using filename [if] [in] [, export_delimited_options] */

export delimited using "auto.txt", replace

// make and price are variable names: only these two variables are exported.
export delimited make price using "auto.csv", replace

// The import excel help file is opened here as a reference for Excel files.
help import excel

// in: a range of observations (L means the last observation).
// replace is given so the script can be rerun when auto.xlsx already exists.
export excel using "auto.xlsx" in 11/L, replace

// if: only the observations that satisfy the condition.
export excel using "auto.xlsx" if price >= 4000, replace

// firstrow() is an option, so it must come after the comma.
export excel using "auto.xlsx", firstrow(variables) replace

// ---- Repair a dataset whose text is garbled (non-Unicode encoding) ----
// (This section was originally headed "import txt file".)
cd "D:\BaiduNetdiskDownload\stata application"

// Open the file once to look at the garbled text. clear is needed because the
// data currently in memory have been modified and would otherwise block use.
use "D:\BaiduNetdiskDownload\stata application\Data_luanma.dta", clear

help unicode

// Empty memory before translating the file on disk.
clear

// Declare the source encoding, then convert the file in the current directory.
unicode encoding set gb18030
unicode translate "Data_luanma.dta"

// Reload the translated file.
use "Data_luanma.dta", clear

// ---- Opening large files ----

// Allow Stata to open large .xlsx workbooks.
set excelxlsxlargefile on

// NOTE(review): placeholder from the lecture notes - put the workbook path
// between the quotes before running.
import excel " "

// If memory is too small, split the big file with the user-written chunky.
findit chunky
help chunky

cd "D:\BaiduNetdiskDownload\stata application"

// NOTE(review): 数据文件名称 ("data file name") below is a placeholder.

// Look at the data without importing them; (1) means the first line.
chunky using "数据文件名称", peek(1)

// Analyze the file.
chunky using 数据文件名称, analyze

// Split the file into pieces of 10m each.
chunky using 数据文件名称, chunksize(10m) header(include) stub(数据文件名称) replace

// Load one of the pieces. The closing quote must be a straight ", not a curly one.
// NOTE(review): encoding(数据) is a placeholder - supply the real encoding name.
import delimited using "其中一个数据文件名称", clear encoding(数据)

// Load the system dataset.
sysuse auto.dta, clear

// ---- Split the data into several csv files ----
export delimited "auto1.csv" in 1, replace
export delimited "auto2.csv" in 2, replace
export delimited "auto3.csv" in 3, replace
export delimited "auto4.csv" in 4, replace
export delimited "auto5.csv" in 5/L, replace

help foreach

// numlist demo: 1 2 3, then 5, 8, then 9 to 100 in steps of 10.
// The range element is written 9(10)100 without an embedded blank.
foreach num of numlist 1/3 5 8 9(10)100 {
    display `num'
}

// ---- Re-import every piece and save it as a .dta file ----
// Only auto1.csv ... auto5.csv were written above, so the loop runs over 1/5.
foreach num of numlist 1/5 {
    display "auto `num'"
    import delimited "auto`num'.csv", clear
    save "auto`num'.dta", replace
}

help append

// append reads Stata datasets, so stack the .dta copies (not the .csv files),
// starting from the first piece.
use "auto1.dta", clear
append using "auto2.dta" "auto3.dta" "auto4.dta" ///
    "auto5.dta"

// The same result with a loop.
use "auto1.dta", clear
foreach num of numlist 2/5 {
    append using "auto`num'.dta"
}

save "auto_new.dta", replace

// User-written alternative that opens several files at once.
ssc install openall
findit openall

// NOTE(review): the file pattern is placed before the comma and options after
// it; confirm the exact syntax with -help openall-.
openall auto?, insheet

// ---- Delete the intermediate csv files ----
foreach x of numlist 1/5 {
    erase "auto`x'.csv"
}

// Lecture 3: display formats
cd "D:\BaiduNetdiskDownload\stata application"

help format

// clear lets the script continue even when unsaved data are still in memory.
sysuse auto.dta, clear

// Widen the display width of the string variable to make browsing easier.
format %30s make

// Browse.
edit make

// Show a string variable next to numeric variables.
edit make price headroom

// The minus sign left-aligns the string.
format %-20s make

// Two digits after the decimal point.
format %3.2f headroom

// General numeric format, width 10.
format %10.0g price

// Show the data in the Results window.
list make price headroom in 1/10

// ---- Labels ----
sysuse auto.dta, clear

// Overview of the dataset.
describe

// Change the label of the whole dataset.
label data "US auto data美国汽车数据"

// Change a variable label.
label var price "auto price汽车价格"

// Value labels for a dummy variable: define the label first, then attach it.
label define origin_v 0 "国产" 1 "进口"
label values foreign origin_v

describe

replace foreign = 2 in 1/8

// A newly defined value label needs a new name (origin_v is already taken).
label define origin_new2 0 "国产" 1 "进口" 2 "unknown"
// Attach it; without this line the new definition is never used.
label values foreign origin_new2

// ---- Listing with conditions ----
// == tests equality; a single = means assignment.
list make if foreign == 0

list make price if make == "AMC Concord" | make == "Merc. Cougar"

list make foreign price if (foreign == 1 & price <= 5000) ///
    | (foreign == 0 & price > 3000)

// inlist() takes comma-separated values; string comparison is case-sensitive.
list make price if inlist(make, "AMC Concord", "Merc. Cougar")

// Round-trip through Excel without a header row: after the import the
// variables are named after the spreadsheet columns (A, B, C, ...).
sysuse auto.dta, clear
export excel using "auto.xlsx", nolabel replace
import excel using "auto.xlsx", clear
describe

// ---- Rename variables ----
help rename

// Column A holds make and column B holds price (auto.dta variable order),
// so price is restored from B.
rename B price

// Capitalize the first letter of every variable name, lower-case the rest.
// (A // comment at the end of a line must be preceded by a blank.)
rename _all, proper

// Copy every variable name into its variable label; the key is looping over _all.
foreach v of varlist _all {
    label variable `v' " `v' "
}

// Original note to self: "what does this mean?"
// generate creates a new variable; replace changes an existing one.
help generate
help replace

cd "D:\BaiduNetdiskDownload\stata application"

sysuse auto.dta, clear

// Square of price.
gen price2 = price^2

// Interaction of price and mpg, computed only where foreign == 1.
gen price_mpg = price*mpg if foreign == 1

// Replace the resulting missing values with 0.
replace price_mpg = 0 if price_mpg == .

// Logarithm of price.
gen logprice = log(price)

// Natural logarithm of price; gives the same result as the line above.
gen lnprice = ln(price)

// Taking the log of zero yields a missing value, so observations may be lost.

// Recompute the interaction for all observations, overwriting price_mpg.
replace price_mpg = price*mpg

// Build pricecateg, which sorts the observations into price groups.
gen pricecateg = 0
replace pricecateg = 1 if price >= 5000 & price < 10000
replace pricecateg = 2 if price >= 10000

edit price pricecateg

// Give the categories readable names.
label define category 0 "less than 5k" 1 "between 5k and 10k" 2 "more than 10k"
label values pricecateg category

edit price pricecateg

help egen

// Overall mean of price and each car's deviation from it.
egen priceavg3 = mean(price)
gen price_dev = price - priceavg3

// ---- Mean of price computed separately for foreign and domestic cars ----
sort foreign

// Method 1: one mean per group, then merge the two columns.
egen price_avg = mean(price) if foreign == 1
egen price_avg2 = mean(price) if foreign == 0
replace price_avg = price_avg2 if foreign == 0
drop price_avg2

// Method 2: mean within each value of foreign (needs the sort above).
by foreign: egen priceavg_by = mean(price)

// Sort the data by the value of foreign.
sort foreign

help tostring   // numeric variable -> string
help destring   // string -> numeric variable; strings cannot be used in arithmetic

// ---- tostring / destring ----
sysuse auto.dta, clear
edit mpg

// Keep mpg and create a string copy.
tostring mpg, gen(mpg_str)

// No new variable: mpg itself is replaced by its string version.
tostring mpg, replace force
edit mpg mpg_str

// Turn the string back into a number.
destring mpg_str, replace force
edit mpg mpg_str

// ---- encode ----
sysuse auto.dta, clear

// Recode the text variable as a labelled numeric variable.
encode make, gen(make_num)
edit make_num make

sysuse auto.dta, clear

// ---- Dummy variables ----
// Start from a variable that is 0 everywhere ...
gen dummy_high = 0
// ... and set it to 1 where price is at least 10000.
replace dummy_high = 1 if price >= 10000

// Alternative: a logical expression is 1 where it holds and 0 otherwise.
// Note the strict > here versus >= above; the check below shows any difference.
gen indicator_hi = (price > 10000)

// Check where the two variables disagree.
edit price dummy_high indicator_hi
edit dummy_high indicator_hi if dummy_high ~= indicator_hi
sum dummy_high indicator_hi

// Recode 0 -> 1 and 1 -> 2 into a new variable.
recode foreign (0=1) (1=2), gen(for_new)

// ---- Quartile groups of price (25%, 50%, 75%) ----
// Value of price at each of these percentiles.
egen price_pc25 = pctile(price), p(25)
egen price_pc50 = pctile(price), p(50)
egen price_pc75 = pctile(price), p(75)

gen price_4cat = 0
replace price_4cat = 1 if price >= price_pc25 & price < price_pc50
replace price_4cat = 2 if price >= price_pc50 & price < price_pc75
// >= so that a price exactly equal to the 75th percentile is not left in group 0.
replace price_4cat = 3 if price >= price_pc75

/* Combining datasets:
   append, merge, joinby */

// ---- Vertical combination (append) ----
sysuse auto.dta, clear
keep if foreign == 0            // keep only the domestic cars
save auto_domestic.dta, replace

sysuse auto.dta, clear
keep if foreign == 1
save auto_foreign.dta, replace

// Stack the domestic cars under the foreign ones.
append using auto_domestic.dta

// ---- Horizontal combination (merge) ----
sysuse auto.dta, clear
gen id = _n                     // row number used as the key (one per car model)
keep make id mpg weight length
save auto_tech.dta, replace

sysuse auto.dta, clear
// The key must be built the same way on both sides: id = _n.
gen id = _n
drop make mpg weight length     // drop the variables stored in auto_tech.dta

help merge

merge 1:1 id using "auto_tech.dta"

cd "D:\BaiduNetdiskDownload\stata application"

use "nei_sample.dta", clear

// Browse these three variables.
edit newid year so2

// Ascending sort.
sort newid year

// Some units have no observation in some years: an unbalanced sample.

// newid ascending, year descending.
gsort newid -year
edit newid year facilityname_origin

gsort -facilityname_origin year

// Put these variables first, in this order.
order so2 co newid year

// Move one variable in front of another.
order newid, before(co)

// ---- String variables ----
use "nei_sample.dta", clear

edit newid facilityname_origin year
sort newid facilityname_origin year

// Work on a copy so the original name is preserved.
gen facility_name = facilityname_origin
edit facility_name facilityname_origin

// Lower-case the values; upper() is the upper-case counterpart.
replace facility_name = lower(facility_name)

// trim / ltrim / rtrim remove blanks on both sides / the left / the right.
replace facility_name = trim(facility_name)
edit facility_name
replace facility_name = ltrim(facility_name)
replace facility_name = rtrim(facility_name)

// Replace punctuation with a blank; . as last argument replaces every occurrence.
replace facility_name = subinstr(facility_name, ",", " ", .)
replace facility_name = subinstr(facility_name, ".", " ", .)
replace facility_name = subinstr(facility_name, "/", " ", .)
replace facility_name = subinstr(facility_name, "#", " ", .)
replace facility_name = subinstr(facility_name, ":", " ", .)
replace facility_name = subinstr(facility_name, "’", " ", .)
replace facility_name = subinstr(facility_name, "**", " ", .)
replace facility_name = subinstr(facility_name, ":", " ", .)

// Remove corporate suffixes as whole words only. subinword() is used for all
// of them so that "co", "inc" and "lp" are not cut out of the middle of
// longer words.
replace facility_name = subinword(facility_name, "company", " ", .)
replace facility_name = subinword(facility_name, "co", " ", .)
replace facility_name = subinword(facility_name, "inc", " ", .)
replace facility_name = subinword(facility_name, "lp", " ", .)

replace facility_name = ltrim(facility_name)

// Flag the names containing "u s" so they are easy to find. This has to
// happen before "u s" is rewritten, otherwise nothing can match.
gen flag = 1 if regexm(facility_name, "u s") == 1

replace facility_name = subinstr(facility_name, "u s", "us", .)

gen flag2 = 1 if regexm(facility_name, "us") == 1

// Split into words (facility_name1, facility_name2, ...) and keep the first two.
split facility_name
gen fac_name = facility_name1 + " " + facility_name2

edit zipcode

// Split the string at a given separator character.
split zipcode, parse(-)
edit zipcode

help substr   // extract part of a string

// zip5 holds the first five characters of zipcode.
gen zip5 = substr(zipcode, 1, 5)

// Show zipcode and zip5 where zip5 is not five characters long.
edit zipcode zip5 if length(zip5) ~= 5
edit zip5

// String length counted in Unicode characters (the right count for Chinese text).
gen len_cn = ustrlen(zipcode)

// ---- Split fips into a 2-character and a 3-character part ----
edit fips
gen fips2 = substr(fips, 1, 2)
edit fips2
gen fips3 = substr(fips, 3, 3)
edit fips2 fips3

// Round-trip through numeric and back to string.
destring fips2, replace force
destring fips3, replace force
tostring fips2 fips3, replace force
edit fips2 fips3

// Pad with leading zeros again so the two parts add up to five digits.
replace fips2 = "0" + fips2 if length(fips2) == 1
replace fips3 = "0" + fips3 if length(fips3) == 2
replace fips3 = "00" + fips3 if length(fips3) == 1

// ---- Duplicate observations ----
help duplicates

sort newid

// Report duplicates in terms of newid and year.
duplicates report newid year

// dup = number of additional copies of each newid-year combination.
duplicates tag newid year, gen(dup)
tab dup

// Variable name spelled out in full rather than relying on an abbreviation.
edit newid year if dup >= 177

// Drop the duplicated observations.
duplicates drop newid year, force
duplicates report newid year

// Install the user-written unique command.
ssc install unique

// Show how many distinct values there are.
unique newid year
unique fips

// For each newid-year: sum the numeric variables and take the first value of
// the string variables.
collapse (sum) so2 co nox nh3 voc (first) facilityname_origin fips zipcode, by(newid year)

// For each fips-year: sum the pollutants and count the newid values.
collapse (sum) so2 co nox nh3 voc (count) newid, by(fips year)

// ---- Changing the structure of panel data ----
help reshape

// Earlier steps in this file collapse the data destructively, so every
// example below starts again from the facility-level file.
use "nei_sample.dta", clear
keep newid year so2
duplicates drop newid year, force

reshape wide so2, i(newid) j(year)
reshape long so2, i(newid) j(year)

// Several variables at once, with sic as part of the identifier.
use "nei_sample.dta", clear
duplicates drop newid year, force
unique newid year
keep newid year so2 co nox voc nh3 sic

reshape wide so2 co nox voc nh3, i(newid sic) j(year)
// There are fewer rows because some newid correspond to more than one sic.

reshape long so2 co nox voc nh3, i(newid sic) j(year)
// Going wide and back to long yields a balanced panel.

// Quiz: reshape to id - year - pollutant - emissions.
use "nei_sample.dta", clear
duplicates drop newid year, force
keep newid year so2 co nox voc
rename (so2 co nox voc) (pol1 pol2 pol3 pol4)

reshape long pol, i(newid year) j(type)

// Turn the numeric suffix back into the pollutant name (order as in rename).
tostring type, replace force
replace type = "so2" if type == "1"
replace type = "co"  if type == "2"
replace type = "nox" if type == "3"
replace type = "voc" if type == "4"

// ---- Lags and leads ----
// Reload the facility-level data: the reshape examples earlier in this file
// leave a dataset that no longer contains so2.
use "nei_sample.dta", clear

edit newid year so2
duplicates drop newid year, force
edit newid year so2

sort newid year

// Value from the previous row of the same newid - the previous row, which is
// not necessarily the previous year.
by newid: gen lag1so2 = so2[_n-1]

// Value from the next row.
by newid: gen f1so2 = so2[_n+1]

// Value from the last row (latest year) of each newid.
bysort newid (year): gen Nso2 = so2[_N]

// A true one-period lag that copes with the unbalanced panel.
// NOTE(review): xtset needs a numeric newid - confirm its storage type.
xtset newid year
gen lso2 = l.so2

// ---- Statistics within fips-year groups ----
duplicates drop newid year, force
edit fips newid year
sort fips newid year

// bysort is required: the data are sorted by fips newid year, not fips year.
bysort fips year: egen id_sum = count(newid)
edit fips year newid so2

// total() is the documented name of the egen sum() function.
bysort fips year: egen so2_fips = total(so2)

// ---- Aggregating with collapse ----
// Start from the facility-level data so that the block does not depend on
// variables created or dropped earlier in the file.
use "nei_sample.dta", clear

// One row per newid-year: pollutants summed, first value of the identifiers kept.
// by() is required; without it collapse reduces the data to a single row and
// the newid/year check below cannot run.
collapse (sum) so2 co nox nh3 voc (first) facilityname_origin fips, by(newid year)

duplicates report newid year

// Sum the pollution of all facilities in an area to get the area's total,
// and count the facilities.
collapse (sum) so2 co nox nh3 voc (count) newid, by(fips year)

// Collapse by 2-digit sic, state part of fips, and year.
// sic was dropped by the collapses above, so reload first.
// NOTE(review): substr() assumes fips and sic are string variables - confirm.
use "nei_sample.dta", clear
gen state = substr(fips, 1, 2)
gen sic2 = substr(sic, 1, 2)

collapse (sum) so2 co nox nh3 voc, by(state sic2 year)

R语言和Stata的相似之处在于入门容易,上手快。根据相关公开信息,R是免费使用的开源软件,而Stata是需要购买授权的商业软件;两者的语法都简洁易懂,运行速度较快。R作为一种统计分析软件,是集统计分析与图形显示于一体的。Stata则更强调通过命令和do文件完成数据管理与计量分析。

R是统计领域广泛使用的诞生于1980年左右的S语言的一个分支。可以认为R是S语言的一种交互式实现。它的一些主要特征是:

第一,它是完全免费,开放源代码的。可以在它的网站及其镜像中下载任何有关的安装程序、源代码、程序包及其源代码、文档资料。

第二,R是一种可编程的语言。作为一个开放的统计编程环境,语法通俗易懂,很容易学会和掌握语言的语法。

第三,R语言其实就是一种环境平台。它提供平台,而统计分析研究人员和计算机研究人员可以将各自通过编程实现的统计分析方法以程序包(package)的方式放在R语言平台上,供一般的统计分析者直接使用。我们即使不懂统计分析原理,也可以通过写一句命令让软件调用统计分析包,帮我们执行某一个统计分析。

第四,由于R语言的开放性,它的更新速度比一般统计软件(如SPSS、SAS等)快得多。最新的统计分析方法、最复杂的方法都能在R语言上找到。

第五,由于它比SPSS、SAS、Stata更注重编程,相对来说学习起来具有一定难度,但它属于傻瓜式的编程。你能想到的所有统计相关的工作,R都可以非常简洁地用几行命令帮你完成。