diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 3d8c52220f1ff2ec774e9f64b347b38c46ef6262..72d58421223ddb6fd066dae2bbc66c1a532f6fa0 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -59,12 +59,28 @@ static struct perf_header *header;
 
 static u64		sample_type;
 
-static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask)
+
+static size_t
+callchain__fprintf_left_margin(FILE *fp, int left_margin)
+{
+	/*
+	 * Emit the fixed base indent, then one extra space per unit of
+	 * left_margin (none if it is <= 0); return the summed fprintf() results.
+	 */
+	int printed = fprintf(fp, "            ");
+
+	while (left_margin-- > 0)
+		printed += fprintf(fp, " ");
+	return printed;
+}
+
+static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask,
+					  int left_margin)
 {
 	int i;
 	size_t ret = 0;
 
-	ret += fprintf(fp, "%s", "                ");
+	ret += callchain__fprintf_left_margin(fp, left_margin);
 
 	for (i = 0; i < depth; i++)
 		if (depth_mask & (1 << i))
@@ -79,12 +95,12 @@ static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask)
 static size_t
 ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain, int depth,
 		       int depth_mask, int count, u64 total_samples,
-		       int hits)
+		       int hits, int left_margin)
 {
 	int i;
 	size_t ret = 0;
 
-	ret += fprintf(fp, "%s", "                ");
+	ret += callchain__fprintf_left_margin(fp, left_margin);
 	for (i = 0; i < depth; i++) {
 		if (depth_mask & (1 << i))
 			ret += fprintf(fp, "|");
@@ -123,7 +139,8 @@ static void init_rem_hits(void)
 
 static size_t
 __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
-			   u64 total_samples, int depth, int depth_mask)
+			   u64 total_samples, int depth, int depth_mask,
+			   int left_margin)
 {
 	struct rb_node *node, *next;
 	struct callchain_node *child;
@@ -164,7 +181,8 @@ __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 		 * But we keep the older depth mask for the line seperator
 		 * to keep the level link until we reach the last child
 		 */
-		ret += ipchain__fprintf_graph_line(fp, depth, depth_mask);
+		ret += ipchain__fprintf_graph_line(fp, depth, depth_mask,
+						   left_margin);
 		i = 0;
 		list_for_each_entry(chain, &child->val, list) {
 			if (chain->ip >= PERF_CONTEXT_MAX)
@@ -172,11 +190,13 @@ __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 			ret += ipchain__fprintf_graph(fp, chain, depth,
 						      new_depth_mask, i++,
 						      new_total,
-						      cumul);
+						      cumul,
+						      left_margin);
 		}
 		ret += __callchain__fprintf_graph(fp, child, new_total,
 						  depth + 1,
-						  new_depth_mask | (1 << depth));
+						  new_depth_mask | (1 << depth),
+						  left_margin);
 		node = next;
 	}
 
@@ -190,17 +210,19 @@ __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 
 		ret += ipchain__fprintf_graph(fp, &rem_hits, depth,
 					      new_depth_mask, 0, new_total,
-					      remaining);
+					      remaining, left_margin);
 	}
 
 	return ret;
 }
 
+
 static size_t
 callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
-			 u64 total_samples)
+			 u64 total_samples, int left_margin)
 {
 	struct callchain_list *chain;
+	bool printed = false;
 	int i = 0;
 	int ret = 0;
 
@@ -208,17 +230,27 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 		if (chain->ip >= PERF_CONTEXT_MAX)
 			continue;
 
-		if (!i++ && sort_by_sym_first)
+		if (!i++ && sort__first_dimension == SORT_SYM)
 			continue;
 
+		if (!printed) {
+			ret += callchain__fprintf_left_margin(fp, left_margin);
+			ret += fprintf(fp, "|\n");
+			ret += callchain__fprintf_left_margin(fp, left_margin);
+			ret += fprintf(fp, "---");
+
+			left_margin += 3;
+			printed = true;
+		} else
+			ret += callchain__fprintf_left_margin(fp, left_margin);
+
 		if (chain->sym)
-			ret += fprintf(fp, "                %s\n", chain->sym->name);
+			ret += fprintf(fp, " %s\n", chain->sym->name);
 		else
-			ret += fprintf(fp, "                %p\n",
-					(void *)(long)chain->ip);
+			ret += fprintf(fp, " %p\n", (void *)(long)chain->ip);
 	}
 
-	ret += __callchain__fprintf_graph(fp, self, total_samples, 1, 1);
+	ret += __callchain__fprintf_graph(fp, self, total_samples, 1, 1, left_margin);
 
 	return ret;
 }
@@ -251,7 +283,7 @@ callchain__fprintf_flat(FILE *fp, struct callchain_node *self,
 
 static size_t
 hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
-			      u64 total_samples)
+			      u64 total_samples, int left_margin)
 {
 	struct rb_node *rb_node;
 	struct callchain_node *chain;
@@ -271,7 +303,8 @@ hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
 			break;
 		case CHAIN_GRAPH_ABS: /* Falldown */
 		case CHAIN_GRAPH_REL:
-			ret += callchain__fprintf_graph(fp, chain, total_samples);
+			ret += callchain__fprintf_graph(fp, chain, total_samples,
+							left_margin);
 		case CHAIN_NONE:
 		default:
 			break;
@@ -316,8 +349,19 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, u64 total_samples)
 
 	ret += fprintf(fp, "\n");
 
-	if (callchain)
-		hist_entry_callchain__fprintf(fp, self, total_samples);
+	if (callchain) {
+		int left_margin = 0;
+
+		if (sort__first_dimension == SORT_COMM) {
+			se = list_first_entry(&hist_entry__sort_list, typeof(*se),
+						list);
+			left_margin = se->width ? *se->width : 0;
+			left_margin -= thread__comm_len(self->thread);
+		}
+
+		hist_entry_callchain__fprintf(fp, self, total_samples,
+					      left_margin);
+	}
 
 	return ret;
 }
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 60ced707bd6b5a88993fb206e098fe1fcee9cd72..b490354d1b23d31e1d6a977c4801b2f34c97acea 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -7,7 +7,8 @@ char		default_sort_order[] = "comm,dso,symbol";
 char		*sort_order = default_sort_order;
 int		sort__need_collapse = 0;
 int		sort__has_parent = 0;
-int		sort_by_sym_first;
+
+enum sort_type	sort__first_dimension;
 
 unsigned int dsos__col_width;
 unsigned int comms__col_width;
@@ -266,9 +267,18 @@ int sort_dimension__add(const char *tok)
 			sort__has_parent = 1;
 		}
 
-		if (list_empty(&hist_entry__sort_list) &&
-		    !strcmp(sd->name, "symbol"))
-			sort_by_sym_first = true;
+		if (list_empty(&hist_entry__sort_list)) {
+			if (!strcmp(sd->name, "pid"))
+				sort__first_dimension = SORT_PID;
+			else if (!strcmp(sd->name, "comm"))
+				sort__first_dimension = SORT_COMM;
+			else if (!strcmp(sd->name, "dso"))
+				sort__first_dimension = SORT_DSO;
+			else if (!strcmp(sd->name, "symbol"))
+				sort__first_dimension = SORT_SYM;
+			else if (!strcmp(sd->name, "parent"))
+				sort__first_dimension = SORT_PARENT;
+		}
 
 		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
 		sd->taken = 1;
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 24c2b709f0d3921c420c966fe53aad11f94280e3..333e664ff45fddfe9838f113dda25dc0a26c3860 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -39,7 +39,7 @@ extern struct sort_entry sort_parent;
 extern unsigned int dsos__col_width;
 extern unsigned int comms__col_width;
 extern unsigned int threads__col_width;
-extern int sort_by_sym_first;
+extern enum sort_type sort__first_dimension;
 
 struct hist_entry {
 	struct rb_node		rb_node;
@@ -54,6 +54,14 @@ struct hist_entry {
 	struct rb_root		sorted_chain;
 };
 
+enum sort_type {		/* values taken by sort__first_dimension */
+	SORT_PID	= 0,	/* "pid" */
+	SORT_COMM	= 1,	/* "comm" */
+	SORT_DSO	= 2,	/* "dso" */
+	SORT_SYM	= 3,	/* "symbol" */
+	SORT_PARENT	= 4,	/* "parent" */
+};
+
 /*
  * configurable sorting bits
  */
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index f53fad7c0a8d4a1fde9cceff0a8ac750b8d849b2..8cb47f1d8a763dea687d4b21c0587a68b4f117b7 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -33,6 +33,17 @@ int thread__set_comm(struct thread *self, const char *comm)
 	return self->comm ? 0 : -ENOMEM;
 }
 
+int thread__comm_len(struct thread *self)
+{
+	/* Cache strlen(comm) in comm_len; 0 means "not computed yet". */
+	if (self->comm_len)
+		return self->comm_len;
+	/* NOTE(review): cache is never reset here; check thread__set_comm(). */
+	if (self->comm)
+		self->comm_len = strlen(self->comm);
+	return self->comm_len;
+}
+
 static size_t thread__fprintf(struct thread *self, FILE *fp)
 {
 	struct rb_node *nd;
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 1abef3b7455dc17d4fd5c0d3b6f39c20564219c3..53addd77ce8f2f1c800f9471b8d11f4ce8a6fddb 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -12,9 +12,11 @@ struct thread {
 	pid_t			pid;
 	char			shortname[3];
 	char			*comm;
+	int			comm_len;
 };
 
 int thread__set_comm(struct thread *self, const char *comm);
+int thread__comm_len(struct thread *self);
 struct thread *threads__findnew(pid_t pid);
 struct thread *register_idle_thread(void);
 void thread__insert_map(struct thread *self, struct map *map);