-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexperiments.py
More file actions
66 lines (51 loc) · 2.37 KB
/
experiments.py
File metadata and controls
66 lines (51 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import requests
import re
import json
import humanize
books = None
sefaria_url = 'http://sefaria.org/api/texts/%s?commentary=0'
sefaria_url_with_sections = 'http://sefaria.org/api/texts/%s.1-%s?commentary=0'
try:
with open('books.json') as books_file:
books = json.loads(books_file.read())
except IOError:
books = [
{'name': 'Genesis'},
{'name': 'Exodus'},
{'name': 'Leviticus'},
{'name': 'Numbers'},
{'name': 'Deuteronomy'},
]
for book in books:
meta_data = requests.get(sefaria_url % book['name']).json()
book['chapters'] = meta_data['length']
data = requests.get(sefaria_url_with_sections % (book['name'], book['chapters'])).json()
book['text'] = data['he']
with open('books.json', 'w') as books_file:
books_json = json.dumps(books, indent=4)
books_file.write(books_json)
def hebrew_letters_only(string):
'''Removes vowelization, whitespace and punctuation.'''
string = re.sub(ur'[^\u05D0-\u05EA]', '', string)
return string
def verify_code(book, first_letter, spacing, length, index=0):
entire_text = hebrew_letters_only(''.join(book['text'][0]))
print entire_text
first_index = [m.start() for m in re.finditer(first_letter,entire_text)][index]
print "Starting from the %s %s every %s letters in book %s:" % (humanize.ordinal(index+1), first_letter, spacing, book['name'])
#print "context: %s" % entire_text[first_index: first_index + (spacing * length)]
print "%s: %s" % (first_index, entire_text[first_index: first_index + (spacing * length): spacing])
verify_code(books[0], u'\u05ea', 50, 4)
verify_code(books[1], u'\u05ea', 50, 4)
verify_code(books[2], u'\u05d9', 8, 4)
verify_code(books[3], u'\u05d4', 50, 4, index=2)
verify_code(books[4], u'\u05d4', 60, 4, index=1)
entire_text = hebrew_letters_only(''.join([verse for book in books for chapter in book['text'] for verse in chapter]))
torah_re = re.compile(ur'\u05ea.{49}\u05d5.{49}\u05e8.{49}\u05d4.{49}')
# all occurences of a "torah"
result = torah_re.findall(entire_text)
torah_in_hebrew = u'\u05ea\u05d5\u05e8\u05d4'
print "%s occurences of %s found in the entire bible." % (len(result), torah_in_hebrew)
# all occurences of a "torah" backwards
result = torah_re.findall(entire_text[::-1])
print "%s backward occurences of %s found in the entire bible." % (len(result), torah_in_hebrew)