analysis.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
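"""Analyze a benchmark training log and report speed / profiler statistics.

The script scans the log file given by --filename for lines that contain
--keyword, extracts the numeric field that follows the keyword (or the field
at --position), and prints averaged throughput or latency according to
--model_mode. The final result is emitted as a JSON dict on stdout.

Example invocation (all values below are illustrative; --keyword must match
the format of your own log):

    python analysis.py \
        --filename train.log \
        --keyword "speed:" \
        --base_batch_size 32 \
        --gpu_num 8 \
        --skip_steps 5 \
        --model_mode 3 \
        --index 1
"""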
from __future__ import print_function
import argparse
import json
import os
import re
import traceback


def parse_args():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--filename", type=str, help="The name of the log file to analyze.")
    parser.add_argument(
        "--log_with_profiler",
        type=str,
        help="The path of the training log with profiler enabled.")
    parser.add_argument(
        "--profiler_path",
        type=str,
        help="The path of the profiler timeline log.")
    parser.add_argument(
        "--keyword", type=str, help="Keyword that marks the data to analyze.")
    parser.add_argument(
        "--separator",
        type=str,
        default=None,
        help="Separator between fields in a log line.")
    parser.add_argument(
        '--position',
        type=int,
        default=None,
        help='The index of the data field in the split line.')
    parser.add_argument(
        '--range',
        type=str,
        default="",
        help='Character range of the data field to intercept, e.g. "5" or "2:7".')
    parser.add_argument(
        '--base_batch_size', type=int, help='Base batch size per GPU.')
    parser.add_argument(
        '--skip_steps',
        type=int,
        default=0,
        help='The number of steps to be skipped.')
    parser.add_argument(
        '--model_mode',
        type=int,
        default=-1,
        help='Analysis mode, default value is -1.')
    parser.add_argument('--ips_unit', type=str, default=None, help='IPS unit.')
    parser.add_argument(
        '--model_name',
        type=str,
        default=0,
        help='Training model_name, e.g. transformer_base.')
    parser.add_argument(
        '--mission_name', type=str, default=0, help='Training mission name.')
    parser.add_argument(
        '--direction_id', type=int, default=0, help='Training direction_id.')
    parser.add_argument(
        '--run_mode',
        type=str,
        default="sp",
        help='Multi-process (mp) or single-process (sp) training.')
    parser.add_argument(
        '--index',
        type=int,
        default=1,
        help='{1: speed, 2: mem, 3: profiler, 6: max_batch_size}')
    parser.add_argument(
        '--gpu_num', type=int, default=1, help='Number of training GPUs.')
    args = parser.parse_args()
    args.separator = None if args.separator == "None" else args.separator
    return args


def _is_number(num):
    pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
    result = pattern.match(num)
    if result:
        return True
    else:
        return False
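
# Illustrative behavior of _is_number above (example inputs chosen here):
#   _is_number("3.14")  -> True   (plain float)
#   _is_number("12")    -> True   (plain integer)
#   _is_number("2:7")   -> False  (slice-style ranges are handled separately in _distil)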


class TimeAnalyzer(object):
    def __init__(self,
                 filename,
                 keyword=None,
                 separator=None,
                 position=None,
                 range="-1"):
        if filename is None:
            raise Exception("Please specify the filename!")
        if keyword is None:
            raise Exception("Please specify the keyword!")
        self.filename = filename
        self.keyword = keyword
        self.separator = separator
        self.position = position
        self.range = range
        self.records = None
        self._distil()
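
    # Illustration (the log line below is made up): with keyword="speed:",
    # separator=None and an empty range, a line such as
    #     "step 100, loss 2.31, speed: 12.345 steps/s"
    # yields the token after the keyword, "12.345", which is stored as
    # float(12.345) in self.records.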
    def _distil(self):
        self.records = []
        with open(self.filename, "r") as f_object:
            lines = f_object.readlines()
            for line in lines:
                if self.keyword not in line:
                    continue
                try:
                    result = None

                    # Distil the string from a line.
                    line = line.strip()
                    line_words = line.split(
                        self.separator) if self.separator else line.split()
                    if self.position:
                        result = line_words[self.position]
                    else:
                        # Distil the string following the keyword.
                        for i in range(len(line_words) - 1):
                            if line_words[i] == self.keyword:
                                result = line_words[i + 1]
                                break

                    # Distil the result from the picked string.
                    if not self.range:
                        result = result[0:]
                    elif _is_number(self.range):
                        result = result[0:int(self.range)]
                    else:
                        result = result[int(self.range.split(":")[0]):int(
                            self.range.split(":")[1])]
                    self.records.append(float(result))
                except Exception:
                    print("line is: {}; separator={}; position={}".format(
                        line, self.separator, self.position))
        print("Extract {} records: separator={}; position={}".format(
            len(self.records), self.separator, self.position))

    def _get_fps(self,
                 mode,
                 batch_size,
                 gpu_num,
                 avg_of_records,
                 run_mode,
                 unit=None):
        if mode == -1 and run_mode == 'sp':
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records
        elif mode == -1 and run_mode == 'mp':
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records  # temporarily unused for now
            print("------------this is mp")
        elif mode == 0:
            # s/step -> samples/s
            fps = (batch_size * gpu_num) / avg_of_records
            unit = "samples/s"
        elif mode == 1:
            # steps/s -> steps/s
            fps = avg_of_records
            unit = "steps/s"
        elif mode == 2:
            # s/step -> steps/s
            fps = 1 / avg_of_records
            unit = "steps/s"
        elif mode == 3:
            # steps/s -> samples/s
            fps = batch_size * gpu_num * avg_of_records
            unit = "samples/s"
        elif mode == 4:
            # s/epoch -> s/epoch
            fps = avg_of_records
            unit = "s/epoch"
        else:
            raise ValueError("Unsupported analysis mode.")
        return fps, unit
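
    # Worked example for _get_fps with mode 0 (numbers are illustrative):
    # batch_size=32, gpu_num=8 and an average of 0.25 s/step give
    # (32 * 8) / 0.25 = 1024 samples/s.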

    def analysis(self,
                 batch_size,
                 gpu_num=1,
                 skip_steps=0,
                 mode=-1,
                 run_mode='sp',
                 unit=None):
        if batch_size <= 0:
            print("base_batch_size should be larger than 0.")
            return 0, ''
        if len(self.records) <= skip_steps:
            # Covers the case where the number of records does not exceed skip_steps.
            print("no records")
            return 0, ''

        sum_of_records = 0
        sum_of_records_skipped = 0
        skip_min = self.records[skip_steps]
        skip_max = self.records[skip_steps]

        count = len(self.records)
        for i in range(count):
            sum_of_records += self.records[i]
            if i >= skip_steps:
                sum_of_records_skipped += self.records[i]
                if self.records[i] < skip_min:
                    skip_min = self.records[i]
                if self.records[i] > skip_max:
                    skip_max = self.records[i]

        avg_of_records = sum_of_records / float(count)
        avg_of_records_skipped = sum_of_records_skipped / float(count -
                                                                skip_steps)

        fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records,
                                      run_mode, unit)
        fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num,
                                       avg_of_records_skipped, run_mode, unit)
        if mode == -1:
            print("average ips of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f %s" % (avg_of_records, fps_unit))
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average ips of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit))
                print("\tMin: %.3f %s" % (skip_min, fps_unit))
                print("\tMax: %.3f %s" % (skip_max, fps_unit))
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
        elif mode == 1 or mode == 3:
            print("average latency of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f steps/s" % avg_of_records)
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average latency of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f steps/s" % avg_of_records_skipped)
                print("\tMin: %.3f steps/s" % skip_min)
                print("\tMax: %.3f steps/s" % skip_max)
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
        elif mode == 0 or mode == 2:
            print("average latency of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f s/step" % avg_of_records)
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average latency of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f s/step" % avg_of_records_skipped)
                print("\tMin: %.3f s/step" % skip_min)
                print("\tMax: %.3f s/step" % skip_max)
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))

        return round(fps_skipped, 3), fps_unit
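
# Minimal programmatic usage sketch (file name and keyword are hypothetical):
#   analyzer = TimeAnalyzer("train.log", keyword="speed:", separator=None,
#                           position=None, range="")
#   fps, unit = analyzer.analysis(batch_size=32, gpu_num=8, skip_steps=5, mode=3)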


if __name__ == "__main__":
    args = parse_args()
    run_info = dict()
    run_info["log_file"] = args.filename
    run_info["model_name"] = args.model_name
    run_info["mission_name"] = args.mission_name
    run_info["direction_id"] = args.direction_id
    run_info["run_mode"] = args.run_mode
    run_info["index"] = args.index
    run_info["gpu_num"] = args.gpu_num
    run_info["FINAL_RESULT"] = 0
    run_info["JOB_FAIL_FLAG"] = 0

    try:
        if args.index == 1:
            if args.gpu_num == 1:
                run_info["log_with_profiler"] = args.log_with_profiler
                run_info["profiler_path"] = args.profiler_path
            analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator,
                                    args.position, args.range)
            run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis(
                batch_size=args.base_batch_size,
                gpu_num=args.gpu_num,
                skip_steps=args.skip_steps,
                mode=args.model_mode,
                run_mode=args.run_mode,
                unit=args.ips_unit)
            try:
                if int(os.getenv('job_fail_flag')) == 1 or int(run_info[
                        "FINAL_RESULT"]) == 0:
                    run_info["JOB_FAIL_FLAG"] = 1
            except Exception:
                pass
        elif args.index == 3:
            run_info["FINAL_RESULT"] = {}
            records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead',
                                            None, 3, '').records
            records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead',
                                            None, 5).records
            records_ct_total = TimeAnalyzer(args.filename, 'Computation time',
                                            None, 3, '').records
            records_gm_total = TimeAnalyzer(args.filename, 'GpuMemcpy Calls',
                                            None, 4, '').records
            records_gm_ratio = TimeAnalyzer(args.filename, 'GpuMemcpy Calls',
                                            None, 6).records
            records_gmas_total = TimeAnalyzer(args.filename,
                                              'GpuMemcpyAsync Calls',
                                              None, 4, '').records
            records_gms_total = TimeAnalyzer(args.filename,
                                             'GpuMemcpySync Calls',
                                             None, 4, '').records
            run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[
                0] if records_fo_total else 0
            run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[
                0] if records_fo_ratio else 0
            run_info["FINAL_RESULT"][
                "ComputationTime_Total"] = records_ct_total[
                    0] if records_ct_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[
                0] if records_gm_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[
                0] if records_gm_ratio else 0
            run_info["FINAL_RESULT"][
                "GpuMemcpyAsync_Total"] = records_gmas_total[
                    0] if records_gmas_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[
                0] if records_gms_total else 0
        else:
            print("Not supported!")
    except Exception:
        traceback.print_exc()

    # This print is required: the JSON output carries the log file path that
    # gets inserted into the database.
    print("{}".format(json.dumps(run_info)))