-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch_columns.py
86 lines (70 loc) · 2.3 KB
/
search_columns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 29 12:25:57 2017
@author: julio
"""
import urllib.request
from lxml import html
#import requests
import re
from itertools import groupby
from operator import itemgetter
import numpy
from collections import Counter
#from parse import *
#from urllib.parse import urlparse
#from urllib.parse import search
import os
# Load myfile5.html from the current working directory through a file:// URL.
cwd = os.getcwd()
file = 'file://'+cwd+'/myfile5.html'
# The context manager closes the response as soon as the bytes are read.
with urllib.request.urlopen(file) as response:
    myfile5 = response.read()
myfile5 = myfile5.decode("utf-8")
tree = html.fromstring(myfile5)
# Text nodes that are children of the root or of any descendant element
# other than <script>/<style>.
data = tree.xpath('(.|.//*[not(name()="script")][not(name()="style")])/text()')

# Merge pass: a fragment that contains no newline is joined (separated by a
# single space) to the fragment that follows it, and the pair is consumed as
# one entry. The result is built in a new list instead of deleting from the
# list being iterated, and a trailing fragment without a successor is kept
# unchanged rather than raising IndexError.
fragments = data[0:-1]  # the final text node is deliberately left out
data_correct = []
pos = 0
while pos < len(fragments):
    fragment = fragments[pos]
    if '\n' not in fragment and pos + 1 < len(fragments):
        data_correct.append(fragment + ' ' + fragments[pos + 1])
        pos += 2
    else:
        data_correct.append(fragment)
        pos += 1

# Drop entries that are exactly a newline, then trim surrounding whitespace.
# Whitespace-only entries other than '\n' survive as empty strings.
data_correct = [fragment.strip() for fragment in data_correct if fragment != '\n']
#%% Find attributes
listnumbers = []
# Pixel offsets (as digit strings) captured from inline "div style"
# attributes in the raw HTML text.
top = re.findall("div style.*top:([0-9]+)px.", myfile5)
left = re.findall("div style.*left:([0-9]+)px.", myfile5)

# Patterns are compiled once instead of being re-evaluated for every entry.
# heading_pattern: entry starts with three or more capital letters.
# split_pattern: capitalised text, a space, then a final run of capitals.
heading_pattern = re.compile("(^([A-Z]{3,}( [0-9]+)*(.*%)*)(.*[A-Z]+)*)")
split_pattern = re.compile("([A-Z]{3,}.*) (([A-Z]+))")

# Collect title candidates from the cleaned text entries.
titles = []
for entry in data_correct:
    if not heading_pattern.search(entry):
        continue
    # Search once and reuse the match object.
    pair = split_pattern.search(entry)
    if pair:
        # The entry holds two titles: everything before the last
        # space-separated capital run, and that run itself.
        titles.append(pair.group(1))
        titles.append(pair.group(2))
    else:
        titles.append(entry)
#%%
# Titles of interest, written as regular expressions.
titles_needed = ['RANG','(GEBURTEN [0-9]+)','NAME','STRASSE','ORT']
s = " ".join(titles)

# Every occurrence of a wanted title inside the joined title text becomes a
# key of bigdic, initialised to 0. findall returns an empty list when there
# is no match, so no separate search is needed beforehand.
bigdic = {}
for wanted in titles_needed:
    for found in re.findall(wanted, s):
        bigdic[found] = 0

# For each cleaned entry: the leading capitalised word, as a list that is
# either empty or holds that single word.
orte = [re.findall('^([A-Z][a-z]+)', entry) for entry in data_correct]
def checkforsequence(data):
    """Record entries that start a run of three all-digit strings.

    Slides a three-wide window over *data*. Whenever the current entry and
    the two entries after it all satisfy ``str.isdigit()``, the current
    entry is appended to the module-level ``listnumbers`` list.

    Args:
        data: list of strings to scan.

    Returns:
        None. Matches are appended to ``listnumbers`` as a side effect.
    """
    # The window is taken from *data* itself (not from a global list indexed
    # by a leftover module-level counter), and zip stops two entries before
    # the end so the look-ahead can never run out of range.
    # NOTE(review): the name suggests a check on the spacing between the
    # numbers; only the all-digits test is implemented here - confirm the
    # intended rule before relying on this.
    for first, second, third in zip(data, data[1:], data[2:]):
        if first.isdigit() and second.isdigit() and third.isdigit():
            listnumbers.append(first)
# One flag per cleaned entry: True when the entry consists only of digits.
# The entry itself is tested; using it as a list index raised TypeError.
listnumbers = [entry.isdigit() for entry in data_correct]

# Absolute distance between each position and its 0/1 flag.
# NOTE(review): presumably the numeric value of the entry was meant here
# rather than the boolean flag - confirm the intended statistic.
diffs = [abs(pos - int(flag)) for pos, flag in enumerate(listnumbers)]

# Most frequent distance and how often it occurs. With no entries there is
# nothing to count, so fall back to (None, 0) instead of raising IndexError.
if diffs:
    most_common, num_most_common = Counter(diffs).most_common(1)[0]
else:
    most_common, num_most_common = None, 0