-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathReader.java
More file actions
84 lines (68 loc) · 2.39 KB
/
Reader.java
File metadata and controls
84 lines (68 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// Reads URLs from the tweets Excel sheet and writes them into the URLs spreadsheet.
package URLScrape;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
/**
 * Copies every URL found in the tweets workbook into the URL workbook.
 *
 * <p>Column index 7 of the first sheet of {@code tweets.xlsx} is scanned for the text
 * {@code "http"}. Each occurrence is written to the first sheet of {@code URLS.xlsx} as a
 * new row: column 0 holds the URL and column 1 holds the source row number plus one.
 */
public class Reader {
    /**
     * Number of URLs found so far. It is incremented before each call to
     * {@link #printURL}, which uses it as the index of the output row to create, so the
     * first URL lands in row index 1.
     */
    static int total = 0;

    private static final String TWEETS_FILE = "tweets.xlsx";
    private static final String URLS_FILE = "URLS.xlsx";
    /** Zero-based index of the column that holds the tweet text. */
    private static final int TWEET_COLUMN = 7;
    private static final String URL_MARKER = "http";

    /**
     * Scans every row of the tweets sheet and saves the extracted URLs to {@code URLS.xlsx}.
     *
     * @param args unused
     * @throws IOException if a workbook cannot be read or the result cannot be written
     */
    public static void main(String[] args) throws IOException {
        try (XSSFWorkbook tweets = loadWorkbook(TWEETS_FILE);
                XSSFWorkbook URLworkbook = loadWorkbook(URLS_FILE)) {
            XSSFSheet sheet = tweets.getSheetAt(0);
            XSSFSheet URLSheet = URLworkbook.getSheetAt(0);

            // Walk whatever rows the sheet actually has instead of a fixed count, so a
            // shorter sheet no longer fails and a longer one is not silently truncated.
            for (Row row : sheet) {
                String tweet = readTweet(row);
                if (tweet == null) {
                    continue;
                }
                // One output row per occurrence of the marker within the tweet.
                int start = tweet.indexOf(URL_MARKER);
                while (start >= 0) {
                    total++;
                    // printURL expects the text that follows the leading 'h'.
                    printURL(tweet.substring(start + 1), row.getRowNum(), URLSheet);
                    start = tweet.indexOf(URL_MARKER, start + 1);
                }
            }

            // Write while the workbook is still open; it is closed by the enclosing
            // try-with-resources only after the data has been saved.
            try (FileOutputStream output = new FileOutputStream(URLS_FILE)) {
                URLworkbook.write(output);
            }
        } catch (FileNotFoundException e) {
            System.err.println("Spreadsheet could not be opened: " + e.getMessage());
        }
    }

    /**
     * Writes one URL and the row it came from into row {@link #total} of the URL sheet.
     *
     * @param substring text that starts immediately after the leading {@code 'h'} of the
     *     URL; everything from the first whitespace character onwards is discarded
     * @param rowNum zero-based row index of the source tweet; stored as {@code rowNum + 1}
     * @param URLSheet sheet that receives the new row
     */
    public static void printURL(String substring, int rowNum, XSSFSheet URLSheet) {
        // Restore the 'h' that the caller stripped off.
        String finishedURL = "h" + firstToken(substring);

        Row row = URLSheet.createRow(total);
        row.createCell(0).setCellValue(finishedURL);
        row.createCell(1).setCellValue(rowNum + 1);
    }

    /** Loads a workbook fully into memory and releases the file handle straight away. */
    private static XSSFWorkbook loadWorkbook(String path) throws IOException {
        try (FileInputStream input = new FileInputStream(new File(path))) {
            return new XSSFWorkbook(input);
        }
    }

    /** Returns the tweet text of a row, or {@code null} if the cell is missing or not text. */
    private static String readTweet(Row row) {
        Cell cell = row.getCell(TWEET_COLUMN);
        if (cell == null) {
            return null;
        }
        try {
            return cell.getStringCellValue();
        } catch (IllegalStateException ignored) {
            // The cell holds a non-text value (e.g. a number), so it cannot contain a URL.
            return null;
        }
    }

    /** Returns the part of {@code text} before its first whitespace character. */
    private static String firstToken(String text) {
        for (int i = 0; i < text.length(); i++) {
            if (Character.isWhitespace(text.charAt(i))) {
                return text.substring(0, i);
            }
        }
        return text;
    }
}