Accent-Classification/mp3_getter.py at master · abhid2001/Accent-Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd


def mp3getter(lst, max_retries=5, retry_delay=2):
    """Download the numbered mp3 samples of each language into ``Audio/``.

    For every ``(language, count)`` entry the files ``<language>1.mp3`` up to
    ``<language><count>.mp3`` are requested from the archive's soundtracks
    directory and written to ``Audio/<language><i>.mp3``.

    Args:
        lst: Sequence of entries whose first item is the language name and
            whose second item is the number of samples to fetch (the shape
            produced by ``get_formatted_languages``).
        max_retries: How many times a single file is attempted when the
            connection fails before it is skipped.
        retry_delay: Seconds to wait between two attempts for the same file.

    Raises:
        OSError: If the target file cannot be written (for example when the
            ``Audio`` directory does not exist). This used to be swallowed
            and retried forever.
    """
    url = "http://accent.gmu.edu/soundtracks/"
    for entry in lst:
        language, count = entry[0], entry[1]
        for i in range(1, count + 1):
            fname = f"{language}{i}"
            print(f"\nDownloading {fname}.mp3")
            for _attempt in range(max_retries):
                try:
                    mp3 = requests.get(url + fname + ".mp3", timeout=30)
                    # Without this check an error page (e.g. a 404 body)
                    # would be saved to disk as if it were audio.
                    mp3.raise_for_status()
                except requests.HTTPError as err:
                    # The server answered, just not with the file; asking
                    # again will not change that, so skip this sample.
                    print(f"Skipping {fname}.mp3: {err}")
                    break
                except requests.RequestException as err:
                    # Connection-level failure: pause, then try again.
                    print(f"Retrying {fname}.mp3 after error: {err}")
                    time.sleep(retry_delay)
                else:
                    with open(f"Audio/{fname}.mp3", "wb") as audio:
                        audio.write(mp3.content)
                    break
            else:
                print(f"Giving up on {fname}.mp3 after {max_retries} attempts")


def get_num(language):
    """Return the number of samples the archive reports for ``language``.

    The language's browse page is fetched and the third whitespace-separated
    token of the ``<h5>`` inside the first ``div.content`` is parsed as an
    integer.

    Args:
        language: Language name appended to the archive's query string.

    Returns:
        The parsed sample count, or 0 when the page does not contain a
        heading of the expected form.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    url = 'http://accent.gmu.edu/browse_language.php?function=find&language=' + language
    html = requests.get(url, timeout=30)
    soup = BeautifulSoup(html.content, 'html.parser')
    div = soup.find_all('div', 'content')
    try:
        num = int(div[0].h5.string.split()[2])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: no <h5>, or its .string is None.
        # IndexError: no div.content at all, or fewer than three tokens.
        # ValueError: the third token is not a number.
        num = 0
    return num


def get_formatted_languages(languages):
    """Pair each language with its sample count, omitting empty languages.

    Returns a list of ``(language, num)`` tuples in input order, which is the
    shape ``mp3getter`` consumes. Languages for which ``get_num`` reports 0
    are left out.
    """
    # Lazy generator: get_num is still called once per language, in order.
    counted = ((name, get_num(name)) for name in languages)
    return [pair for pair in counted if pair[1] != 0]


def get_speaker_info(start, stop):
    """Scrape speaker detail pages and save them as one CSV.

    Args:
        start: First speaker id to fetch (inclusive).
        stop: Speaker id to stop at (exclusive, as in ``range``).

    Returns:
        pandas DataFrame with one row per speaker id and the columns
        speakerid, filename, birthplace, native_language, age, sex and
        age_onset. The same frame is written to
        ``Data/speaker_info_all.csv``. ``age`` and ``age_onset`` are not
        scraped and are always empty strings. When a page cannot be parsed,
        every field except ``speakerid`` is left empty for that row.

    Raises:
        requests.RequestException: If a detail page cannot be fetched.
    """
    columns = ['speakerid', 'filename', 'birthplace',
               'native_language', 'age', 'sex', 'age_onset']
    user_data = []
    for num in range(start, stop):
        # Start from a blank record so unparsed fields never carry
        # placeholder values into the CSV.
        info = dict.fromkeys(columns, '')
        info['speakerid'] = num
        url = "http://accent.gmu.edu/browse_language.php?function=detail&speakerid={}".format(
            num)
        html = requests.get(url, timeout=30)
        soup = BeautifulSoup(html.content, 'html.parser')
        body = soup.find_all('div', attrs={'class': 'content'})
        try:
            filename = str(body[0].find('h5').text.split()[0])
            bio_items = soup.find_all(
                'ul', attrs={'class': 'bio'})[0].find_all('li')
            # NOTE(review): the fixed slice presumably strips a leading
            # label and a trailing link text from the entry; confirm
            # against the page markup.
            birthplace = str(bio_items[0].text)[13:-6]
            native_language = str(bio_items[1].text.split()[2])
            sex = str(bio_items[3].text.split()[3].strip())
        except (AttributeError, IndexError):
            # Page lacks the expected elements: keep the blank record.
            pass
        else:
            # Only commit the fields once all of them parsed, so a row is
            # either complete or blank.
            info['filename'] = filename
            info['birthplace'] = birthplace
            info['native_language'] = native_language
            info['sex'] = sex
        user_data.append(info)
        print(num)
    # Build the frame once, after the loop; explicit columns keep the header
    # stable even when the id range is empty.
    df = pd.DataFrame(user_data, columns=columns)
    df.to_csv('Data/speaker_info_all.csv')
    return df


def extract_from_data(langs):
    """Write the speaker rows of the requested languages to one CSV.

    Reads ``Data/speaker_info_2920-2940.csv`` and
    ``Data/speaker_info_all.csv``, keeps the rows whose ``native_language``
    is in ``langs``, stacks them (rows from the "all" file first), removes
    the ``Unnamed: 0``, ``age`` and ``age_onset`` columns and saves the
    result to ``Data/final_data_2.csv``.
    """
    def only_requested(table):
        # Boolean-mask selection on the native_language column.
        return table[table['native_language'].isin(langs)]

    late_speakers = pd.read_csv("Data/speaker_info_2920-2940.csv")
    all_speakers = pd.read_csv("Data/speaker_info_all.csv")

    stacked = pd.concat(
        [only_requested(all_speakers), only_requested(late_speakers)],
        axis=0,
    )
    trimmed = stacked.drop(columns=['Unnamed: 0', 'age', 'age_onset'])
    trimmed.to_csv('Data/final_data_2.csv')


def main():
    """Run the currently enabled steps for the selected languages."""
    selected = ['arabic', 'hindi', 'spanish']
    # Look up and show how many samples each selected language has.
    print(get_formatted_languages(selected))
    print('Downloading now...')
    # The scraping steps are switched off; only the extraction from the
    # CSV files already on disk runs. To scrape again, call:
    #   mp3getter(<result of get_formatted_languages>)
    #   get_speaker_info(1, 2942)
    extract_from_data(selected)
    print("DONE!!")


if __name__ == "__main__":
    main()