Skip to content

snippit

URL

code.snippits

text = ['My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of https://www.geeksforgeeks.org/',
        "Hello world https://www.udacity.com/", 
        "Hello World www.udacity.com", 
        "What about these URLs google.com NOPE",  
]

df = spark.createDataFrame(text, T.StringType()).toDF("text")

df = df.withColumn("url", F.regexp_extract(df.text, r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", 0))          
df = df.withColumn("domain", F.regexp_extract(df.url, r'^(?:http:\/\/|www\.|https:\/\/)([^\/]+)', 0))            
df["url", "domain"](/"url", "domain").show(5, False)
files = glob.glob('./out/git_out/**/*.json',  recursive=True)
first_file = files.pop()
git_repo_df = spark.read.json(first_file)
git_repo_df.first()
for tmp_df_path in files:
    tmp_git_repo_df = spark.read.json(tmp_df_path)
    final_df = git_repo_df.unionByName(tmp_git_repo_df)
git_repo_df = parsers.df_parse_email(final_df,  "author_email")