본문 바로가기
Artificial Intelligence

[Python] 텍스트마이닝

by 테리는당근을좋아해 2020. 2. 3.

웹 크롤링

import requests # 웹 request 
from bs4 import BeautifulSoup # 웹 크롤링 패키지 

# url에서 source 가져오기
# res = requests.get('http://www.ciokorea.com/t/27240/%EB%8D%B0%EB%B8%8C%EC%98%B5%EC%8A%A4/129646')
# res = requests.get('http://www.ciokorea.com/news/33848')
res = requests.get('http://www.ciokorea.com/column/119945')

# html 객체로 변환
soup = BeautifulSoup(res.content, 'html.parser')

# 기사 본문 가져오기
html = soup.find(class_='node_body')

# 문자열로 변환
text = html.text

# 문자열을 리스트로 나누기
lists = text.split('\n')

 

Konlpy를 이용한 한글 형태소 분석

from konlpy.tag import Twitter # 한글 형태소 추출
from collections import Counter # count
import nltk # natural language toolkit
from nltk.corpus import stopwords

twitter = Twitter() 
morphs = [] 
for sentence in lists: 
    morphs.append(twitter.pos(sentence)) 

print(morphs)

# 명사 전처리
noun_adj_adv_list=[] 
for sentence in morphs : 
    for word, tag in sentence : 
        if tag in ['Alpha'] and ("또" not in word)and ("등" not in word)and ("앞" not in word)and ("생" not in word)and ("를" not in word)and ("여기" not in word)and ("다른" not in word)and ("예" not in word)and ("은" not in word)and ("위해" not in word)and ("다음" not in word)and ("대한" not in word)and ("아주" not in word)and ("그" not in word)and ("도움" not in word)and ("약" not in word)and ("때문" not in word)and ("여러" not in word) and ("더" not in word) and ("이" not in word) and ("의" not in word) and ("및" not in word) and ("것" not in word) and ("내" not in word)and ("나" not in word)and ("수"not in word) and("게"not in word)and("말"not in word) : 
            noun_adj_adv_list.append(word) 
        if tag in ['Noun'] and ("또" not in word)and ("등" not in word)and ("앞" not in word)and ("생" not in word)and ("를" not in word)and ("여기" not in word)and ("다른" not in word)and ("예" not in word)and ("은" not in word)and ("위해" not in word)and ("다음" not in word)and ("대한" not in word)and ("아주" not in word)and ("그" not in word)and ("도움" not in word)and ("약" not in word)and ("때문" not in word)and ("여러" not in word) and ("더" not in word) and ("이" not in word) and ("의" not in word) and ("및" not in word) and ("것" not in word) and ("내" not in word)and ("나" not in word)and ("수"not in word) and("게"not in word)and("말"not in word) : 
            noun_adj_adv_list.append(word) 
print(noun_adj_adv_list)

 

빈도 출력 및 시각화

from wordcloud import WordCloud # 텍스트 시각화
import matplotlib.pyplot as plt # 시각화 패키지
import matplotlib 
from IPython.display import set_matplotlib_formats

# 글자 수 세기
count = Counter(noun_adj_adv_list)
words = dict(count.most_common())
print(words)

#word cloud show
matplotlib.rc('font',family = 'Malgun Gothic') 
set_matplotlib_formats('retina') 
matplotlib.rc('axes',unicode_minus = False)

wordcloud = WordCloud(font_path = './Arial.ttf', background_color='white',colormap = "Accent_r", width=1500, height=1000).generate_from_frequencies(words) 
plt.imshow(wordcloud) 
plt.axis('off') 
plt.show()

 

출력 결과

 

 

2020년 비즈니스 지형을 뒤흔들 8가지 기술
원문보기:
http://www.ciokorea.com/t/27240/%EB%8D%B0%EB%B8%8C%EC%98%B5%EC%8A%A4/129646#csidx7737536aa7076bcaa44b069cb2c524b 

 

AI를 100% 활용하기 위해 필요한 능력 5가지

원문보기:
http://www.ciokorea.com/news/33848#csidx8410dd4df0004018caef51b60e27e06 

 

칼럼 | RPA 는 정말 미래의 기술인가 아니면 신조어에 불과한 것인가?

원문보기:
http://www.ciokorea.com/column/119945#csidxcbb4b112585cabcb964678363d57313 

 

소스코드

from konlpy.tag import Twitter # 한글 형태소 추출
from collections import Counter # count

from wordcloud import WordCloud # 텍스트 시각화
import matplotlib.pyplot as plt # 시각화 패키지

import nltk # natural language toolkit
from nltk.corpus import stopwords
 
import matplotlib 
from IPython.display import set_matplotlib_formats

import requests # 웹 request 
from bs4 import BeautifulSoup # 웹 크롤링 패키지 

# url에서 source 가져오기
# res = requests.get('http://www.ciokorea.com/t/27240/%EB%8D%B0%EB%B8%8C%EC%98%B5%EC%8A%A4/129646')
# res = requests.get('http://www.ciokorea.com/news/33848')
res = requests.get('http://www.ciokorea.com/column/119945')

# html 객체로 변환
soup = BeautifulSoup(res.content, 'html.parser')

# 기사 본문 가져오기
html = soup.find(class_='node_body')

# 문자열로 변환
text = html.text

# 문자열을 리스트로 나누기
lists = text.split('\n')


twitter = Twitter() 
morphs = [] 
for sentence in lists: 
    morphs.append(twitter.pos(sentence)) 

print(morphs)

# 명사 전처리
noun_adj_adv_list=[] 
for sentence in morphs : 
    for word, tag in sentence : 
        if tag in ['Alpha'] and ("또" not in word)and ("등" not in word)and ("앞" not in word)and ("생" not in word)and ("를" not in word)and ("여기" not in word)and ("다른" not in word)and ("예" not in word)and ("은" not in word)and ("위해" not in word)and ("다음" not in word)and ("대한" not in word)and ("아주" not in word)and ("그" not in word)and ("도움" not in word)and ("약" not in word)and ("때문" not in word)and ("여러" not in word) and ("더" not in word) and ("이" not in word) and ("의" not in word) and ("및" not in word) and ("것" not in word) and ("내" not in word)and ("나" not in word)and ("수"not in word) and("게"not in word)and("말"not in word) : 
            noun_adj_adv_list.append(word) 
        if tag in ['Noun'] and ("또" not in word)and ("등" not in word)and ("앞" not in word)and ("생" not in word)and ("를" not in word)and ("여기" not in word)and ("다른" not in word)and ("예" not in word)and ("은" not in word)and ("위해" not in word)and ("다음" not in word)and ("대한" not in word)and ("아주" not in word)and ("그" not in word)and ("도움" not in word)and ("약" not in word)and ("때문" not in word)and ("여러" not in word) and ("더" not in word) and ("이" not in word) and ("의" not in word) and ("및" not in word) and ("것" not in word) and ("내" not in word)and ("나" not in word)and ("수"not in word) and("게"not in word)and("말"not in word) : 
            noun_adj_adv_list.append(word) 
print(noun_adj_adv_list)

# 글자 수 세기
count = Counter(noun_adj_adv_list)
words = dict(count.most_common())
print(words)

#word cloud show
matplotlib.rc('font',family = 'Malgun Gothic') 
set_matplotlib_formats('retina') 
matplotlib.rc('axes',unicode_minus = False)

wordcloud = WordCloud(font_path = './Arial.ttf', background_color='white',colormap = "Accent_r", width=1500, height=1000).generate_from_frequencies(words) 
plt.imshow(wordcloud) 
plt.axis('off') 
plt.show()

댓글