Vous êtes sur la page 1 sur 6

10/23/2019 xml2df - Jupyter Notebook

In [135]:

import xml.etree.ElementTree as ET
import os
import pandas as pd

In [129]:

def extract_data(file):
    """Parse one XML file and return the fields used to build a DataFrame row.

    Parameters
    ----------
    file : str
        Path to the XML file, using '/' as the separator.

    Returns
    -------
    tuple
        ``(headline, text, topics, published_date, itemid, xmlfile)`` where

        * ``headline`` is the text of the last ``<headline>`` child of the
          root, or ``None`` when there is none;
        * ``text`` is the concatenation of the ``.text`` of every ``<p>``
          directly under each ``<text>`` child (empty paragraphs add nothing);
        * ``topics`` is the list of ``code`` attributes of every
          ``<metadata>/<codes>/<code>`` element, in document order;
        * ``published_date`` is the ``value`` attribute of the last
          ``<metadata>/<dc element="dc.date.published">``, or ``None``;
        * ``itemid`` is the root element's ``itemid`` attribute;
        * ``xmlfile`` is the file name up to its first '.'.

    Raises
    ------
    KeyError
        If the root has no ``itemid`` attribute, a matching ``<dc>`` has no
        ``value`` attribute, or a ``<code>`` has no ``code`` attribute.
    """
    # Last path component, truncated at the first dot.
    xmlfile = file.split('/')[-1].split('.')[0]

    # Defaults keep the function from raising UnboundLocalError at the
    # return statement when a document has no headline or published date.
    headline = None
    published_date = None
    paragraphs = []
    topics = []

    root = ET.parse(file).getroot()
    itemid = root.attrib['itemid']

    for child in root:
        if child.tag == 'headline':
            headline = child.text
        elif child.tag == 'text':
            # Element.text is None for an empty <p/>; treat it as ''.
            paragraphs.extend(p.text or '' for p in child.findall('p'))
        elif child.tag == 'metadata':
            for dc in child.findall('dc'):
                # .get() tolerates <dc> elements without an 'element' attribute.
                if dc.attrib.get('element') == 'dc.date.published':
                    published_date = dc.attrib['value']
            for codes in child.findall('codes'):
                for code in codes.findall('code'):
                    topics.append(code.attrib['code'])

    text = ''.join(paragraphs)
    return (headline, text, topics, published_date, itemid, xmlfile)

In [195]:

# Build one DataFrame from the files found one level below `directory`:
# every entry of `directory` is listed with os.listdir, every entry inside it
# is passed to extract_data, and the returned tuple becomes one row.
# Returns the accumulated DataFrame.
def create_dataframe(directory):
# NOTE(review): the column list on the next line is cut off in this copy of
# the notebook; the remaining column names are not visible here.
df = pd.DataFrame(columns=['Headline','Text','Bip_topic','Publishe
for sub_directory in os.listdir(directory):
for filename in os.listdir(directory+'/'+sub_directory):
# NOTE(review): this call is also cut off at the page margin; presumably the
# path ends with `filename` — confirm against the original notebook.
values = extract_data(directory+'/'+sub_directory+'/'+file
# Label the tuple positionally with the DataFrame's own column names.
keys = df.columns
row = pd.Series(values,index=keys)
# NOTE(review): DataFrame.append copies the frame on every call and was
# removed in pandas 2.0; collecting rows in a list and building the frame
# once would avoid both problems — TODO confirm the pandas version in use.
df = df.append(row,ignore_index=True)
return df

In [204]:

# Load every file found in the sub-directories of Data/Data into one DataFrame.
df = create_dataframe('Data/Data')

localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 1/6
10/23/2019 xml2df - Jupyter Notebook

In [206]:

# Show the raw contents of the 'Bip_topic' column: one list of codes per row.
df['Bip_topic'].values

Out[206]:

array([list(['FRA', 'M11', 'MCAT']),
       list(['USA', 'C12', 'CCAT', 'GCAT', 'GCRIM']),
       list(['UK', 'E71', 'ECAT']), ...,
       list(['USA', 'M14', 'M141', 'MCAT']), list(['UK', 'M14', 'MCAT']),
       list(['CVI', 'FRA', 'SILEN', 'M14', 'M141', 'MCAT'])], dtype=object)

In [207]:

# Collapse the per-row code lists into a single flat list (duplicates kept).
flattened_list = [
    code
    for row_codes in df['Bip_topic'].values
    for code in row_codes
]

localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 2/6
10/23/2019 xml2df - Jupyter Notebook

In [213]:

# Distinct codes across all rows.
pd.unique(flattened_list)

Out[213]:

array(['FRA', 'M11', 'MCAT', 'USA', 'C12', 'CCAT', 'GC


AT', 'GCRIM', 'UK',
'E71', 'ECAT', 'M14', 'M141', 'ITALY', 'POL',
'I75000', 'C11',
'I97912', 'C42', 'E41', 'GJOB', 'SLVNIA', 'EE
C', 'E12', 'G15',
'G151', 'G154', 'M13', 'M132', 'RUSS', 'I7640
0', 'C24', 'GWEA',
'E21', 'E211', 'GHEA', 'M143', 'ALB', 'GVIO',
'ISRAEL', 'JORDAN',
'GDIP', 'AUST', 'HUNG', 'I42700', 'C15', 'C15
1', 'JAP', 'I22100',
'C21', 'M142', 'C32', 'FIN', 'NATO', 'GDEF', 'I
21000', 'C18',
'C181', 'I81402', 'C1511', 'C17', 'C174', 'E21
2', 'NIGEA', 'C171',
'I33020', 'I3302003', 'C152', 'SWITZ', 'M12',
'KAMPA', 'GPOL',
'GFR', 'SPAIN', 'GSPO', 'PHLNS', 'I85000', 'I81
502', 'I8150211',
'PAKIS', 'GDIS', 'I37200', 'I36400', 'C13', 'BE
LG', 'CRTIA',
'ZAIRE', 'INDON', 'SINGP', 'I79020', 'C173', 'A
USTR', 'CHINA',
'SKOREA', 'I25700', 'C23', 'I35101', 'INDIA',
'I76300', 'I83940',
'HKONG', 'LEBAN', 'PORL', 'IRE', 'I41000', 'I64
100', 'EGYPT',
'LIBYA', 'SAARAB', 'GREL', 'C22', 'IRAQ', 'I140
00', 'C33',
'SRILAN', 'GODD', 'I36101', 'CANA', 'I41300',
'I02000', 'I47100',
'C182', 'I42400', 'I01001', 'I0100132', 'I7400
0', 'NZ', 'PERU',
'E11', 'E51', 'I83960', 'I25000', 'I25800', 'M1
31', 'THAIL',
'I24200', 'GVOTE', 'DEN', 'CYPR', 'ESTNIA', 'SW
ED', 'YEMAR',
'CZREP', 'TUNIS', 'TAIWAN', 'I0100119', 'I4122
0', 'C31', 'C311',
'I3302021', 'I13000', 'I11000', 'I1300003', 'C3
12', 'C172',
'GREECE', 'I22450', 'E411', 'I81501', 'BRAZ',

localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 3/6
10/23/2019 xml2df - Jupyter Notebook

'COL', 'I42100',
'E512', 'G158', 'LATV', 'I24300', 'I83800', 'E1
21', 'I3640010',
'I95100', 'NETH', 'I34430', 'MALAY', 'C14', 'NO
RW', 'I25140',
'C41', 'C411', 'I45100', 'I63000', 'I83100', 'G
152', 'ROM',
'I32600', 'ICST', 'I42000', 'BALTST', 'LITH',
'I9741109', 'ARG',
'C16', 'E13', 'E132', 'I35000', 'C34', 'I1610
0', 'I34400', 'BANDH',
'GPRO', 'KENYA', 'I82001', 'I82002', 'I97100',
'GENT', 'WORLD',
'ASIA', 'C313', 'I01002', 'I0100223', 'OMAN',
'I83954', 'BERM',
'I34420', 'I92110', 'I81403', 'I50000', 'YUG',
'I22460', 'BSHZG',
'I83951', 'I66000', 'I8150103', 'I32550', 'I610
00', 'MEX',
'I34200', 'I8150206', 'I47500', 'I64500', 'I010
0138', 'I83600',
'G153', 'I42900', 'I49420', 'I37330', 'I50200',
'I66500', 'I34600',
'C183', 'I34000', 'I34440', 'ABDBI', 'UAE', 'C3
31', 'I34410',
'I77002', 'I0100128', 'I24700', 'I24800', 'I974
1102', 'I65200',
'I37000', 'I1610109', 'I16000', 'E131', 'GABO
N', 'E31', 'E311',
'GSEVEN', 'I82000', 'TANZA', 'SLVAK', 'ALG', 'T
URK', 'ANGOL',
'SAFR', 'UN', 'MOROC', 'I16200', 'I42800', 'I48
300', 'GSCI',
'I0100136', 'I45300', 'I5020002', 'I72101', 'I3
2000', 'VIETN',
'OPEC', 'E14', 'I83200', 'PAPNG', 'I32450', 'I5
0100', 'I8150106',
'I3302019', 'I0100144', 'I8500031', 'I32540',
'I65400', 'E511',
'E513', 'GAMB', 'I42390', 'I47520', 'I66100',
'I49540', 'OECD',
'G157', 'I15000', 'QATAR', 'GFAS', 'I34531', 'I
33030', 'E143',
'GENV', 'I43000', 'I9741105', 'I47510', 'I4910
0', 'BUL', 'IRAN',
'G156', 'LUX', 'CUBA', 'ZAMBIA', 'RWANDA', 'I65
600', 'I64300',
'I64800', 'I16300', 'I0100105', 'I41200', 'BYEL
RS', 'I8500029',

localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 4/6
10/23/2019 xml2df - Jupyter Notebook

'I35102', 'NIGER', 'I26000', 'I34320', 'I010014


2', 'I35300',
'I41100', 'MCDNIA', 'I17000', 'I71000', 'ARME
N', 'AZERB',
'I8150203', 'BURMA', 'MAURTS', 'I65100', 'MONA
C', 'I36200',
'I24000', 'I31000', 'MALI', 'VEN', 'UKRN', 'I48
000', 'SYRIA',
'BOTS', 'SADCC', 'I22471', 'I22400', 'ASEAN',
'I32840', 'COSR',
'PANA', 'I49410', 'I3640045', 'I75100', 'I97411
10', 'I32510',
'I32754', 'G155', 'KAZK', 'I5020043', 'I79010',
'I0100137', 'CHAD',
'JAMA', 'DEVGCO', 'I72300', 'CHIL', 'I5010022',
'SOMAL', 'BRUNEI',
'I25510', 'I46700', 'MEAST', 'I97911', 'I0100
0', 'I8150108',
'I6560002', 'GOBIT', 'I25130', 'I5010025', 'MOZ
AM', 'MACAO',
'I65300', 'PARA', 'URU', 'ZIMBAB', 'AFGH', 'VCA
N', 'I41600',
'MOLDV', 'GWELF', 'I83700', 'GHANA', 'HON', 'KI
RGH', 'TADZK',
'TURKM', 'USSR', 'UZBK', 'I34330', 'UGANDA', 'I
32100', 'CAYI',
'I36300', 'ETHPA', 'I97400', 'PURI', 'I22472',
'DOMR', 'ICEL',
'I5020041', 'NKOREA', 'I64200', 'EUR', 'TIMOR',
'I34540', 'I34520',
'I5020044', 'I34340', 'I25110', 'MALTA', 'I2512
0', 'BURUN',
'I82003', 'NAMIB', 'I24100', 'I34100', 'I330200
4', 'I33010',
'I5020011', 'SENEG', 'I03000', 'I81401', 'GTOU
R', 'TONGA',
'I24794', 'SILEN', 'NANT', 'I32830', 'I32040',
'I3640007',
'I50300', 'I8395449', 'I8150214', 'I72102', 'I6
4000', 'MALAW',
'I8500021', 'I45000', 'I48100', 'GRGIA', 'I8100
0', 'THDWLD',
'SUDAN', 'OAU', 'LIECHT', 'WSOMOA', 'LESOT', 'T
OGO', 'I3640047',
'I92120', 'CARIB', 'LAM', 'I0100206', 'I161010
7', 'I8200316',
'I0100141', 'I77001', 'I92300', 'WEIND', 'I0100
145', 'I3302020',
'I3640048', 'I5010031', 'I8500011', 'SEASIA',

localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 5/6
10/23/2019 xml2df - Jupyter Notebook

'ECU', 'NICG',
'BAHRN', 'NEPAL', 'I41900', 'LIBER', 'MAURTN',
'I25670',
'I8200318', 'ELSAL', 'HAIT', 'I32810', 'I6620
0', 'I49200', 'GTEN',
'I22470', 'KUWAIT', 'I47101', 'I64700', 'I8480
3', 'I97412',
'I83500', 'REUNI', 'I9741112', 'I25620', 'I7700
3', 'FIJI',
'I37400', 'E61', 'I1300014', 'I24500', 'I4400
0', 'I41230',
'I32300', 'I37300', 'I23000', 'I34350', 'I81501
10', 'BOL', 'MRCSL',
'I98100', 'I3302017', 'I32200', 'MALAG', 'GUA
T', 'SOLIL', 'I64600',
'DUBAI', 'I47200', 'I84801', 'I47530', 'I364002
9', 'I37100',
'BHUTAN', 'I1300002', 'I3640002', 'I83400', 'RA
KH', 'I34700',
'I6560003', 'I22300', 'I5020028', 'I22000', 'I6
540011', 'I48110',
'I6540005', 'I42210', 'CVI'], dtype=object)

In [ ]:

localhost:8888/notebooks/Desktop/sai_charan/xml2df.ipynb# 6/6