PyBibTextTools/SpringerCsv2Bib.py at master · larcc-group/PyBibTextTools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import sys
import tempfile
from shutil import copyfile
import unidecode
import argparse
import pandas as pd
import csv
import io
sys.path.insert(0, "./pybtex/")
from pybtex.database import BibliographyData, Entry, Person
from pybtex.database import parse_file, parse_string


def author_fix(author_tmp):
    """Convert a Springer CSV author cell into a BibTeX ``author`` value.

    The Springer CSV export glues author names together without any
    separator and may leave academic credentials after a name, e.g.::

        "Sergey Ablameyko PhD, DSc, Prof, FIEE, FIAPR, SMIEEETony Pridmore BSc, PhD"
        "Yingying ZhuCong YaoXiang Bai"

    These are rewritten as::

        "Sergey Ablameyko and Tony Pridmore"
        "Yingying Zhu and Cong Yao and Xiang Bai"

    Args:
        author_tmp: Raw author string read from the CSV.

    Returns:
        The author names separated by ``" and "``, with the known
        credentials removed, commas turned into spaces and whitespace
        collapsed to single spaces (no leading/trailing whitespace).

    NOTE: the author boundary is detected as "lowercase letter immediately
    followed by an uppercase letter", so surnames with inner capitals
    (e.g. "McDonald") are still split. Only the credentials listed below
    are recognised.
    """
    import re  # local import so the function does not depend on file-level imports

    credentials = ("PhD", "DSc", "Prof", "FIEE", "FIAPR", "SMIEEE", "BSc")
    # A run of credentials together with the commas/whitespace around them.
    # The negative lookahead keeps names that merely start with a credential
    # (e.g. "Profeta") intact, while still matching a credential glued to the
    # next author's capitalised name ("SMIEEETony").
    credential_run = re.compile(
        r"(?:[\s,]*(?:" + "|".join(credentials) + r")(?![a-z]))+[\s,]*"
    )

    def replace_run(match):
        # Credentials at either end of the string separate nothing; a run in
        # the middle sits between two authors and becomes the separator.
        if match.start() == 0 or match.end() == len(match.string):
            return " "
        return " and "

    text = credential_run.sub(replace_run, author_tmp)
    text = text.replace(",", " ")

    # Split names that were glued together: "ZhuCong" -> "Zhu and Cong".
    pieces = []
    previous_is_lower = False
    for char in text:
        if previous_is_lower and char.isupper() and char.isalpha():
            pieces.append(" and ")
        pieces.append(char)
        previous_is_lower = char.islower() and char.isalpha()

    # Collapse every whitespace run (removed credentials leave gaps behind).
    return " ".join("".join(pieces).split())


def run(csv_file_name, bib_file_name):
    """Convert a Springer CSV export into a BibTeX file.

    Reads ``csv_file_name`` (its first line is skipped and the columns are
    named by position), builds one bibliography entry per row that has an
    author, and writes the result to ``bib_file_name`` in BibTeX format.
    Progress and summary counters are printed to stdout.

    Args:
        csv_file_name: Path of the CSV file to read.
        bib_file_name: Path of the BibTeX file to write.

    Returns:
        None. If ``csv_file_name`` is not an existing file, a message is
        printed and nothing is written.
    """
    if not os.path.isfile(csv_file_name):
        print("File not found: ", csv_file_name)
        return

    colnames = [
        "title",
        "journal",
        "book",
        "volume",
        "issue",
        "doi",
        "author",
        "year",
        "url",
        "type",
    ]

    # The original author reported that pandas failed on complex source
    # paths, so the CSV is read from a copy with a simple name. The copy
    # lives in a private temporary directory that is removed afterwards
    # (tempfile.mktemp() is deprecated and never cleaned the copy up).
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file = os.path.join(tmp_dir, "springer.csv")
        copyfile(csv_file_name, tmp_file)
        # dtype=str keeps values exactly as written in the CSV; otherwise a
        # numeric column containing gaps is read as float and a volume such
        # as 12 would be emitted as "12.0".
        pn = pd.read_csv(tmp_file, names=colnames, skiprows=1, dtype=str)

    bib_data = BibliographyData()
    total = 0
    not_author = 0

    # Columns copied verbatim into the entry, in output order.
    plain_columns = ("title", "journal", "volume", "issue", "doi", "year", "url")

    for row_index, row in pn.iterrows():
        total = total + 1

        # The DOI is the entry key; rows without one get a positional key so
        # that building the progress message below cannot fail on NaN.
        if pd.isnull(row["doi"]):
            key_paper = "row" + str(row_index)
        else:
            key_paper = str(row["doi"])
        print("Chave " + key_paper + "               \r", end="", flush=True)

        if pd.isnull(row["author"]):
            not_author = not_author + 1
            continue

        # Each field is added only when its own cell is filled in (the
        # "issue" field used to be guarded by the "volume" cell).
        fields = [
            (column, str(row[column]))
            for column in plain_columns
            if not pd.isnull(row[column])
        ]
        fields.append(("author", author_fix(row["author"])))

        # A missing type cannot serve as an entry type; fall back to the
        # generic BibTeX "misc" type.
        type_paper = "misc" if pd.isnull(row["type"]) else str(row["type"])
        bib_data.entries[key_paper] = Entry(type_paper, fields)

    print("Processed: ", total, "                             ")
    print("Removed without author: ", not_author)
    print("Total Final: ", len(bib_data.entries))

    bib_data.to_file(bib_file_name, bib_format="bibtex")
    print("Saved file: ", bib_file_name)


def main():
    """Parse the command-line arguments and run the CSV to BibTeX conversion."""
    ap = argparse.ArgumentParser()
    ap.add_argument("-c", "--csvFileName", required=True, help="CSV file name")
    ap.add_argument("-b", "--bibFileName", required=True, help="BibText file name")

    args = vars(ap.parse_args())

    run(args["csvFileName"], args["bibFileName"])


# Only act as a script when executed directly, so importing this module
# (e.g. to reuse author_fix) no longer parses sys.argv or converts a file.
if __name__ == "__main__":
    main()