'''
视频作者 视频分类信息分析
http://www.h4ck.org.cn
by obaby
obaby@mars
email:root@obaby.org.cn
date: 2020.09.04
'''
from pyspark.sql.functions import col
import altair as alt
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
# Load the movie CSV from HDFS into a Spark DataFrame. The "header" option
# makes Spark take the column names from the first row of the file. No schema
# is supplied here, and the schema printed below lists every column as string.
csv = (
    spark.read
    .option("header", True)
    .csv("hdfs://localhost:9000/data2/porn_data_movie.csv")
)

# Print the column names and types of the loaded DataFrame.
csv.printSchema()
# Output of csv.printSchema():
# root
# |-- id: string (nullable = true)
# |-- create: string (nullable = true)
# |-- update: string (nullable = true)
# |-- name: string (nullable = true)
# |-- describe: string (nullable = true)
# |-- source_id: string (nullable = true)
# |-- publish_time: string (nullable = true)
# |-- play_count: string (nullable = true)
# |-- good_count: string (nullable = true)
# |-- bad_count: string (nullable = true)
# |-- link_count: string (nullable = true)
# |-- comment_count: string (nullable = true)
# |-- designation: string (nullable = true)
# |-- category_id: string (nullable = true)
# |-- porn_site_id: string (nullable = true)
# |-- uploader_id: string (nullable = true)
# |-- producer: string (nullable = true)
# Preview the name, describe and uploader_id columns. show() prints a
# tabular sample of the selected rows to stdout and returns nothing.
csv.select(
    col('name'),
    col('describe'),
    col('uploader_id'),
).show()