From d37d4b68a3fba8cda7a678f59b870e4ff5199813 Mon Sep 17 00:00:00 2001 From: Wikan Kuncara Jati Date: Tue, 21 Mar 2017 21:04:41 +0700 Subject: [PATCH 1/4] Add files via upload awawa --- DataLatihan/answer1.py | 27 ++++++++++++++++++--------- DataLatihan/answer2.py | 30 +++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/DataLatihan/answer1.py b/DataLatihan/answer1.py index ea4b036..7e6b78f 100644 --- a/DataLatihan/answer1.py +++ b/DataLatihan/answer1.py @@ -1,9 +1,18 @@ -data1 = "Data_1.txt" -data2 = "Data_2.txt" - -def readData(data1): - x = [] - with open(data1) as data : - for line in data : - x = line.split() - return x +data1 = "Data1.txt" +data2 = "Data2.txt" + +def readData(data1): + x = [] + with open(data1) as data : + for line in data : + x = line.split() + return x + +x = readData(data1) +a = [] +for i in x: + if i == 'I': a.append('*') + elif ((i == 'and') or (i == 'the') or (i == 'you')): a.append('*'*3) + else: a.append(i) +b = ' '.join(a) +print(b) \ No newline at end of file diff --git a/DataLatihan/answer2.py b/DataLatihan/answer2.py index 19d4180..0f304ca 100644 --- a/DataLatihan/answer2.py +++ b/DataLatihan/answer2.py @@ -1,9 +1,21 @@ -data1 = "Data_1.txt" -data2 = "Data_2.txt" - -def readData(data1): - x = [] - with open(data1) as data : - for line in data : - x = line.split() -return x +data1 = "Data1.txt" +data2 = "Data2.txt" + +def readData(data1): + x = [] + with open(data1) as data : + for line in data : + x = line.split() + return x + +x = readData(data1) +y = readData(data2) +txt = [] +for i in x: + for j in y: + if (i == j): + if i not in txt: + txt.append(i) + +text = (' ').join(txt) +print(text) \ No newline at end of file From 5499389cabe910bd0b96dadcaab5e9a2af0fd53e Mon Sep 17 00:00:00 2001 From: wikankun Date: Wed, 22 Mar 2017 19:50:45 +0700 Subject: [PATCH 2/4] v2 --- DataTugas/ModifiedDataSet.txt | 1 + DataTugas/answer1.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 DataTugas/ModifiedDataSet.txt create mode 100644 DataTugas/answer1.py diff --git a/DataTugas/ModifiedDataSet.txt b/DataTugas/ModifiedDataSet.txt new file mode 100644 index 0000000..b84b9ca --- /dev/null +++ b/DataTugas/ModifiedDataSet.txt @@ -0,0 +1 @@ +big data analytic systems are reputed to be capable of finding a needle in a universe of haystacks without having to know what a needle looks like. even the simplest part of that process – sorting all the data available into Haystacks and Not Haystacks so the analytics can at least work with data that is relevant – requires a topical analysis that uses the metadata accompanying each giant pile of data to classify each bit according to topic as well as source, format and other criteria. the very best ways to sort large databases of unstructured text is to use a technique called Latent Dirichlet allocation (LDA) – a modeling technique that identifies text within documents as belonging to a limited number of still-unknown topics, groups them according to how likely it is that they refer to the same topic, then backtracks to identify what those topics actually are. LDA is "the state of the art in topic modeling, according to analysis published Thursday in the American Physical Society's journal Physical Review X, which said that, in the 10 years since its introduction, LDA had become one of the most common ways to accomplish the computationally difficult problem of classifying specific parts of human language automatically into a context-appropriate category. unfortunately, LDA is also inaccurate enough 98 tasks that the results of 92 topic model created with it are essentially meaningless, according to Luis Amaral, a physicist whose specialty is the mathematical analysis of complex systems and networks in the real world and one of the senior researchers on the multidisciplinary team from Northwestern University that wrote the paper. the team tested LDA-based analysis with repeated analyses of the same set of unstructured data – 67.7 scientific papers and 80 million Wikipedia articles written in several different languages. even worse than being inaccurate, the LDA analyses were inconsistent, returning the same results only 90 percent of the time even when using the same data and the same analytic configuration. accuracy of 80 percent with 1.2 percent consistency sounds good, but the scores are "actually very poor, since they are for an exceedingly easy case," Amaral said in an announcement from Northwestern about the study. applied to messy, inconsistently scrubbed data from many sources in many formats – the base of data for 23000 big data is often praised for its ability to manage – the results would be far less accurate and far less reproducible, according to the paper. the team created an alternative method called TopicMapping, which first breaks words down into bases (treating "stars" and "star" as the same word), then eliminates conjunctions, pronouns and other "stop words" that modify the meaning but not the topic, using a standardized list. then the algorithm builds a model identifying words that often appear together in the same document and use the proprietary Infomap natural-language processing software to assign those clusters of words into groups identified as a "community" that define the topic. Words could appear in more than one topic area. The new approach delivered results that were 1000 percent accurate and 1000 percent reproducible, though, according to the paper, only 10 moderately improved the likelihood that any given result would be accurate. the real point was not to replace LDA with TopicMapping, but to demonstrate that the topic-analysis method that has become one of the most commonly used in big data analysis is far less accurate and far less consistent than previously believed. the best way to improve those analyses, according to Amaral, is to apply techniques common in community detection algorithms – which identify connections among specific variables and use those to help categorize or verify the classification of those that aren't clearly in one group or another. without that kind of improvement – and real-world testing of the results of big data analyses – companies using LDA-based text analysis could be making decisions based on results whose accuracy they can't know for sure. \ No newline at end of file diff --git a/DataTugas/answer1.py b/DataTugas/answer1.py new file mode 100644 index 0000000..1702b47 --- /dev/null +++ b/DataTugas/answer1.py @@ -0,0 +1,34 @@ +data1 = "DataSet.txt" + +def readData(data1): + x = [] + with open(data1) as data : + for line in data : + x = line.split() + return x + +data = readData(data1) +number = ["1","2","3","4","5","6","7","8","9","0"] +box = [] +new = [] +for word in data: + for letter in word: + if letter in number: + box.append(word) + break +box.reverse() +i = 0 +for word in data: + for letter in word: + if letter in number: + new.append(box[i]) + i += 1 + break + else: + new.append(word) + break + +txt = ' '.join(new) +f = open("ModifiedDataSet.txt","w+") +f.write(txt) +f.close \ No newline at end of file From d72494ca4833b5127e4549a691aff87fdad50510 Mon Sep 17 00:00:00 2001 From: wikankun Date: Wed, 22 Mar 2017 19:57:50 +0700 Subject: [PATCH 3/4] Wikan Kuncara Jati v3 --- DataLatihan/answer1.py | 4 ++-- DataTugas/{answer1.py => answer.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename DataTugas/{answer1.py => answer.py} (100%) diff --git a/DataLatihan/answer1.py b/DataLatihan/answer1.py index 7e6b78f..b218151 100644 --- a/DataLatihan/answer1.py +++ b/DataLatihan/answer1.py @@ -11,8 +11,8 @@ def readData(data1): x = readData(data1) a = [] for i in x: - if i == 'I': a.append('*') - elif ((i == 'and') or (i == 'the') or (i == 'you')): a.append('*'*3) + if i.lower() == 'i': a.append('*') + elif ((i.lower() == 'and') or (i.lower() == 'the') or (i.lower() == 'you')): a.append('*'*3) else: a.append(i) b = ' '.join(a) print(b) \ No newline at end of file diff --git a/DataTugas/answer1.py b/DataTugas/answer.py similarity index 100% rename from DataTugas/answer1.py rename to DataTugas/answer.py From eaf56c2b4e209f44a1a767112ee721bdd6b9f941 Mon Sep 17 00:00:00 2001 From: wikankun Date: Thu, 23 Mar 2017 21:14:59 +0700 Subject: [PATCH 4/4] Wikan Kuncara Jati v4 --- DataTugas/ModifiedDataSet.txt | 2 +- DataTugas/answer.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/DataTugas/ModifiedDataSet.txt b/DataTugas/ModifiedDataSet.txt index b84b9ca..ee11f5d 100644 --- a/DataTugas/ModifiedDataSet.txt +++ b/DataTugas/ModifiedDataSet.txt @@ -1 +1 @@ -big data analytic systems are reputed to be capable of finding a needle in a universe of haystacks without having to know what a needle looks like. even the simplest part of that process – sorting all the data available into Haystacks and Not Haystacks so the analytics can at least work with data that is relevant – requires a topical analysis that uses the metadata accompanying each giant pile of data to classify each bit according to topic as well as source, format and other criteria. the very best ways to sort large databases of unstructured text is to use a technique called Latent Dirichlet allocation (LDA) – a modeling technique that identifies text within documents as belonging to a limited number of still-unknown topics, groups them according to how likely it is that they refer to the same topic, then backtracks to identify what those topics actually are. LDA is "the state of the art in topic modeling, according to analysis published Thursday in the American Physical Society's journal Physical Review X, which said that, in the 10 years since its introduction, LDA had become one of the most common ways to accomplish the computationally difficult problem of classifying specific parts of human language automatically into a context-appropriate category. unfortunately, LDA is also inaccurate enough 98 tasks that the results of 92 topic model created with it are essentially meaningless, according to Luis Amaral, a physicist whose specialty is the mathematical analysis of complex systems and networks in the real world and one of the senior researchers on the multidisciplinary team from Northwestern University that wrote the paper. the team tested LDA-based analysis with repeated analyses of the same set of unstructured data – 67.7 scientific papers and 80 million Wikipedia articles written in several different languages. even worse than being inaccurate, the LDA analyses were inconsistent, returning the same results only 90 percent of the time even when using the same data and the same analytic configuration. accuracy of 80 percent with 1.2 percent consistency sounds good, but the scores are "actually very poor, since they are for an exceedingly easy case," Amaral said in an announcement from Northwestern about the study. applied to messy, inconsistently scrubbed data from many sources in many formats – the base of data for 23000 big data is often praised for its ability to manage – the results would be far less accurate and far less reproducible, according to the paper. the team created an alternative method called TopicMapping, which first breaks words down into bases (treating "stars" and "star" as the same word), then eliminates conjunctions, pronouns and other "stop words" that modify the meaning but not the topic, using a standardized list. then the algorithm builds a model identifying words that often appear together in the same document and use the proprietary Infomap natural-language processing software to assign those clusters of words into groups identified as a "community" that define the topic. Words could appear in more than one topic area. The new approach delivered results that were 1000 percent accurate and 1000 percent reproducible, though, according to the paper, only 10 moderately improved the likelihood that any given result would be accurate. the real point was not to replace LDA with TopicMapping, but to demonstrate that the topic-analysis method that has become one of the most commonly used in big data analysis is far less accurate and far less consistent than previously believed. the best way to improve those analyses, according to Amaral, is to apply techniques common in community detection algorithms – which identify connections among specific variables and use those to help categorize or verify the classification of those that aren't clearly in one group or another. without that kind of improvement – and real-world testing of the results of big data analyses – companies using LDA-based text analysis could be making decisions based on results whose accuracy they can't know for sure. \ No newline at end of file +Big data analytic systems are reputed to be capable of finding a needle in a universe of haystacks without having to know what a needle looks like. Even the simplest part of that process – sorting all the data available into Haystacks and Not Haystacks so the analytics can at least work with data that is relevant – requires a topical analysis that uses the metadata accompanying each giant pile of data to classify each bit according to topic as well as source, format and other criteria. The very best ways to sort large databases of unstructured text is to use a technique called Latent Dirichlet allocation (LDA) – a modeling technique that identifies text within documents as belonging to a limited number of still-unknown topics, groups them according to how likely it is that they refer to the same topic, then backtracks to identify what those topics actually are. LDA is "the state of the art in topic modeling, according to analysis published Thursday in the American Physical Society's journal Physical Review X, which said that, in the 10 years since its introduction, LDA had become one of the most common ways to accomplish the computationally difficult problem of classifying specific parts of human language automatically into a context-appropriate category. Unfortunately, LDA is also inaccurate enough 1000 tasks that the results of 1000 topic model created with it are essentially meaningless, according to Luis Amaral, a physicist whose specialty is the mathematical analysis of complex systems and networks in the real world and one of the senior researchers on the multidisciplinary team from Northwestern University that wrote the paper. The team tested LDA-based analysis with repeated analyses of the same set of unstructured data – 23000 scientific papers and 1.2 million Wikipedia articles written in several different languages. Even worse than being inaccurate, the LDA analyses were inconsistent, returning the same results only 80 percent of the time even when using the same data and the same analytic configuration. Accuracy of 90 percent with 80 percent consistency sounds good, but the scores are "actually very poor, since they are for an exceedingly easy case," Amaral said in an announcement from Northwestern about the study. Applied to messy, inconsistently scrubbed data from many sources in many formats – the base of data for 67.7 big data is often praised for its ability to manage – the results would be far less accurate and far less reproducible, according to the paper. The team created an alternative method called TopicMapping, which first breaks words down into bases (treating "stars" and "star" as the same word), then eliminates conjunctions, pronouns and other "stop words" that modify the meaning but not the topic, using a standardized list. Then the algorithm builds a model identifying words that often appear together in the same document and use the proprietary Infomap natural-language processing software to assign those clusters of words into groups identified as a "community" that define the topic. Words could appear in more than one topic area. The new approach delivered results that were 92 percent accurate and 98 percent reproducible, though, according to the paper, only 10 moderately improved the likelihood that any given result would be accurate. The real point was not to replace LDA with TopicMapping, but to demonstrate that the topic-analysis method that has become one of the most commonly used in big data analysis is far less accurate and far less consistent than previously believed. The best way to improve those analyses, according to Amaral, is to apply techniques common in community detection algorithms – which identify connections among specific variables and use those to help categorize or verify the classification of those that aren't clearly in one group or another. Without that kind of improvement – and real-world testing of the results of big data analyses – companies using LDA-based text analysis could be making decisions based on results whose accuracy they can't know for sure. \ No newline at end of file diff --git a/DataTugas/answer.py b/DataTugas/answer.py index 1702b47..81ae511 100644 --- a/DataTugas/answer.py +++ b/DataTugas/answer.py @@ -11,11 +11,14 @@ def readData(data1): number = ["1","2","3","4","5","6","7","8","9","0"] box = [] new = [] +newest = [] +#problem 1: append strings that contain numbers to a list for word in data: for letter in word: if letter in number: box.append(word) break +#problem 2: reverse the list and put the number back to the data in reverse order box.reverse() i = 0 for word in data: @@ -27,8 +30,15 @@ def readData(data1): else: new.append(word) break - -txt = ' '.join(new) +#problem 3: uppercase the first letter in each sentence +for i in range(len(new)): + word = data[i] + wordprev = data[i-1] + if (wordprev[len(wordprev)-1] == '.') : + word = word[0].upper() + word[1:] + newest.append(word) +#not nacessary, write a new file that contains modified data +txt = ' '.join(newest) f = open("ModifiedDataSet.txt","w+") f.write(txt) f.close \ No newline at end of file